def post(self, request):
    try:
        node_uuid = request.data["node_uuid"]
    except KeyError:
        return HttpResponseBadRequest("`node_uuid` required")

    identity_id = request.data.get("identity_id")
    if settings.REFINERY_DEPLOYMENT_PLATFORM == "aws" and not identity_id:
        return HttpResponseBadRequest("`identity_id` required")
    elif settings.REFINERY_DEPLOYMENT_PLATFORM != "aws" and identity_id:
        return HttpResponseBadRequest("`identity_id` not permitted for "
                                      "non-AWS deployments")

    try:
        node = Node.objects.get(uuid=node_uuid)
    except Node.DoesNotExist:
        logger.error("Node with UUID '%s' does not exist", node_uuid)
        return HttpResponseNotFound()
    except Node.MultipleObjectsReturned:
        logger.critical("Multiple Nodes found with UUID '%s'", node_uuid)
        return HttpResponseServerError()

    if request.user != node.study.get_dataset().get_owner():
        return HttpResponseForbidden()

    file_store_item = node.get_file_store_item()
    if (file_store_item and not file_store_item.datafile and
            file_store_item.source.startswith(
                (settings.REFINERY_DATA_IMPORT_DIR, 's3://')
            )):
        logger.debug("Adding file to Node '%s'", node)
        file_store_item.source = os.path.basename(file_store_item.source)
        file_store_item.save()

        if identity_id:
            file_source_translator = generate_file_source_translator(
                identity_id=identity_id
            )
        else:
            file_source_translator = generate_file_source_translator(
                username=request.user.username
            )
        translated_datafile_source = file_source_translator(
            file_store_item.source
        )
        file_store_item.source = translated_datafile_source
        # Remove the FileStoreItem's import_task_id to treat it as a
        # brand new file import task when called below.
        # We then have to update its Node's Solr index entry, so the
        # updated file import status is available in the UI.
        file_store_item.import_task_id = ""
        file_store_item.save()
        node.update_solr_index()
        FileImportTask().delay(file_store_item.uuid)

    return HttpResponse(status=202)  # Accepted
def post(self, request, *args, **kwargs):
    if not request.is_ajax() or not request.body:
        return HttpResponseBadRequest()

    file_data = json.loads(request.body)
    try:
        base_path = file_data["base_path"]
    except KeyError:
        base_path = ""

    bad_file_list = []
    translate_file_source = generate_file_source_translator(
        username=request.user.username, base_path=base_path)

    # check if files are available
    try:
        for file_path in file_data["list"]:
            if not isinstance(file_path, str):
                bad_file_list.append(file_path)
            else:
                file_path = translate_file_source(file_path)
                if not os.path.exists(file_path):
                    bad_file_list.append(file_path)
                logger.debug("Checked file path: '%s'", file_path)
    except KeyError:  # if there's no list provided
        return HttpResponseBadRequest()

    # prefix output to protect from JSON vulnerability (stripped by Angular)
    return HttpResponse(")]}',\n" + json.dumps(bad_file_list),
                        content_type="application/json")
def post(self, request, *args, **kwargs):
    if not request.is_ajax() or not request.body:
        return HttpResponseBadRequest()

    file_data = json.loads(request.body)
    try:
        base_path = file_data["base_path"]
    except KeyError:
        base_path = ""

    bad_file_list = []
    translate_file_source = generate_file_source_translator(
        username=request.user.username, base_path=base_path)

    # check if files are available
    try:
        for file_path in file_data["list"]:
            # Explicitly check if file_path here is a string or unicode
            # string
            if not isinstance(file_path, unicode):
                bad_file_list.append(file_path)
            else:
                file_path = translate_file_source(file_path)
                if not os.path.exists(file_path):
                    bad_file_list.append(file_path)
                logger.debug("Checked file path: '%s'", file_path)
    except KeyError:  # if there's no list provided
        return HttpResponseBadRequest()

    # prefix output to protect from JSON vulnerability (stripped by Angular)
    return HttpResponse(")]}',\n" + json.dumps(bad_file_list),
                        content_type="application/json")
def test_translate_from_relative_path_without_base_path(self):
    translate_file_source = \
        generate_file_source_translator(username=self.username)
    source = translate_file_source(self.rel_path_source)
    self.assertEqual(source, os.path.join(
        settings.REFINERY_DATA_IMPORT_DIR, self.username,
        self.rel_path_source))
def post(self, request, *args, **kwargs):
    if not request.is_ajax() or not request.body:
        return HttpResponseBadRequest()

    try:
        file_data = json.loads(request.body)
    except ValueError:
        return HttpResponseBadRequest()
    try:
        input_file_list = file_data['list']
    except KeyError:
        return HttpResponseBadRequest()
    try:
        base_path = file_data['base_path']
    except KeyError:
        base_path = None
    try:
        identity_id = file_data['identity_id']
    except KeyError:
        identity_id = None

    bad_file_list = []
    translate_file_source = generate_file_source_translator(
        username=request.user.username, base_path=base_path,
        identity_id=identity_id)

    if settings.REFINERY_DEPLOYMENT_PLATFORM == 'aws':
        # get a list of all uploaded S3 objects for the user
        uploaded_s3_key_list = []
        s3 = boto3.resource('s3')
        s3_bucket = s3.Bucket(settings.MEDIA_BUCKET)
        for s3_object in s3_bucket.objects.filter(
                Prefix='uploads/{}'.format(identity_id)):
            uploaded_s3_key_list.append(s3_object.key)

    for input_file_path in input_file_list:
        if not isinstance(input_file_path, unicode):
            bad_file_list.append(input_file_path)
        else:
            input_file_path = translate_file_source(input_file_path)
            if settings.REFINERY_DEPLOYMENT_PLATFORM == 'aws':
                # check if S3 object key exists
                bucket_name, key = parse_s3_url(input_file_path)
                if key not in uploaded_s3_key_list:
                    bad_file_list.append(os.path.basename(key))
            else:  # POSIX file system
                if not os.path.exists(input_file_path):
                    bad_file_list.append(input_file_path)
            logger.debug("Checked file path: '%s'", input_file_path)

    # prefix output to protect from JSON vulnerability (stripped by Angular)
    return HttpResponse(")]}',\n" + json.dumps(bad_file_list),
                        content_type="application/json")
def process_metadata_table(username, title, metadata_file, source_columns,
                           data_file_column, auxiliary_file_column=None,
                           base_path="", data_file_permanent=False,
                           species_column=None, genome_build_column=None,
                           annotation_column=None, sample_column=None,
                           assay_column=None, slug=None, is_public=False):
    """Create a dataset given a metadata file object and its description

    :param username: username
    :type username: str
    :param title: dataset name
    :type title: str
    :param metadata_file: metadata file in tab-delimited format
    :type metadata_file: file
    :param source_columns: a list of source column indices
    :type source_columns: list of ints
    :param data_file_column: data file column index
    :type data_file_column: int
    :param data_file_permanent: should data files be imported
    :type data_file_permanent: bool
    :param base_path: path to append to data file
    :type base_path: str
    :param auxiliary_file_column: auxiliary file column index
    :type auxiliary_file_column: int
    :param species_column: species column index
    :type species_column: int
    :param genome_build_column: genome build column index
    :type genome_build_column: int
    :param annotation_column: annotation column index
    :type annotation_column: int
    :param slug: dataset name shortcut
    :type slug: str
    :param is_public: is dataset available to public
    :type is_public: bool
    :returns: UUID of the new dataset
    """
    try:
        source_columns = [abs(int(x)) for x in source_columns]
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("source column indices must be integers")
    try:
        data_file_column = int(data_file_column)
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("data file column index must be an integer")
    try:
        auxiliary_file_column = int(auxiliary_file_column)
    except (TypeError, ValueError):
        auxiliary_file_column = None
    try:
        base_path = str(base_path)
    except ValueError:
        base_path = ""
    try:
        species_column = int(species_column)
    except (TypeError, ValueError):
        species_column = None
    try:
        genome_build_column = int(genome_build_column)
    except (TypeError, ValueError):
        genome_build_column = None
    try:
        annotation_column = int(annotation_column)
    except (TypeError, ValueError):
        annotation_column = None
    try:
        sample_column = int(sample_column)
    except (TypeError, ValueError):
        sample_column = None
    try:
        assay_column = int(assay_column)
    except (TypeError, ValueError):
        assay_column = None
    try:
        slug = str(slug)
    except ValueError:
        slug = None
    data_file_permanent = bool(data_file_permanent)
    is_public = bool(is_public)

    file_source_translator = generate_file_source_translator(
        username=username, base_path=base_path)
    parser = SingleFileColumnParser(
        metadata_file=metadata_file,
        file_source_translator=file_source_translator,
        source_column_index=source_columns,
        data_file_column_index=data_file_column,
        auxiliary_file_column_index=auxiliary_file_column,
        file_base_path=base_path,
        data_file_permanent=data_file_permanent,
        species_column_index=species_column,
        genome_build_column_index=genome_build_column,
        annotation_column_index=annotation_column,
        sample_column_index=sample_column,
        assay_column_index=assay_column,
        column_index_separator="/")

    investigation = parser.run()
    investigation.title = title
    investigation.save()

    return create_dataset(
        investigation_uuid=investigation.uuid,
        username=username,
        dataset_name=title,
        slug=slug,
        public=is_public)
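For context, a minimal call to the function above might look like the following sketch; the username, title, file path, and column indices are hypothetical placeholders rather than values taken from the source, and all other keyword arguments keep their defaults.

# Hypothetical usage sketch for process_metadata_table.
with open("/tmp/metadata.txt") as metadata_file:  # placeholder path
    dataset_uuid = process_metadata_table(
        username="alice",          # placeholder existing Refinery user
        title="Example dataset",
        metadata_file=metadata_file,
        source_columns=[0],        # column 0 identifies the sample source
        data_file_column=1,        # column 1 holds the data file path
    )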
def test_translate_from_relative_path_with_base_path(self):
    translate_file_source = \
        generate_file_source_translator(base_path=self.base_path)
    source = translate_file_source(self.rel_path_source)
    self.assertEqual(source,
                     os.path.join(self.base_path, self.rel_path_source))
def test_translate_from_absolute_path(self):
    translate_file_source = generate_file_source_translator()
    source = translate_file_source(self.abs_path_source)
    self.assertEqual(source, self.abs_path_source)
def test_translate_from_url(self):
    translate_file_source = generate_file_source_translator()
    source = translate_file_source(self.url_source)
    self.assertEqual(source, self.url_source)
def test_translate_with_map(self):
    settings.REFINERY_FILE_SOURCE_MAP = {self.url_prefix: '/new/path/'}
    translate_file_source = generate_file_source_translator()
    source = translate_file_source(self.url_source)
    self.assertEqual(source, os.path.join('/new/path/', self.filename))
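Taken together, the translator tests above imply the mapping rules sketched below; the username, paths, and URL are hypothetical placeholders, and each line corresponds to one of the assertions exercised by the tests.

# Hypothetical sketch of generate_file_source_translator behavior,
# restating the cases covered by the tests above.
translate = generate_file_source_translator(username="alice")
translate("sample.fastq")
# -> os.path.join(settings.REFINERY_DATA_IMPORT_DIR, "alice", "sample.fastq")

translate = generate_file_source_translator(base_path="/data/base")
translate("sample.fastq")             # -> "/data/base/sample.fastq"

translate = generate_file_source_translator()
translate("/abs/sample.fastq")        # -> unchanged absolute path
translate("http://example.org/f.gz")  # -> unchanged URL, unless its prefix
                                      #    is remapped via
                                      #    settings.REFINERY_FILE_SOURCE_MAP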
def post(self, request, *args, **kwargs):
    existing_data_set_uuid = request.GET.get('data_set_uuid')
    existing_datafile_names = []
    if existing_data_set_uuid:
        data_set = get_object_or_404(DataSet, uuid=existing_data_set_uuid)
        investigation = data_set.get_investigation()
        existing_datafile_names = investigation.get_datafile_names(
            local_only=True, exclude_metadata_file=True
        )

    if not request.is_ajax() or not request.body:
        return HttpResponseBadRequest()

    try:
        file_data = json.loads(request.body)
    except ValueError:
        return HttpResponseBadRequest()
    try:
        input_file_list = file_data['list']
    except KeyError:
        return HttpResponseBadRequest()
    try:
        base_path = file_data['base_path']
    except KeyError:
        base_path = None
    try:
        identity_id = file_data['identity_id']
    except KeyError:
        identity_id = None

    bad_file_list = []
    translate_file_source = generate_file_source_translator(
        username=request.user.username, base_path=base_path,
        identity_id=identity_id
    )

    uploaded_s3_key_list = []
    if settings.REFINERY_DEPLOYMENT_PLATFORM == 'aws':
        # get a list of all uploaded S3 objects for the user
        s3 = boto3.resource('s3')
        s3_bucket = s3.Bucket(settings.UPLOAD_BUCKET)
        # TODO: handle ParamValidationError (return error msg in response?)
        for s3_object in s3_bucket.objects.filter(Prefix=identity_id):
            uploaded_s3_key_list.append(s3_object.key)

    for input_file_path in input_file_list:
        if not isinstance(input_file_path, unicode):
            bad_file_list.append(input_file_path)
            logger.error("Uploaded file path '%s' is not a string",
                         input_file_path)
        else:
            input_file_path = translate_file_source(input_file_path)
            if settings.REFINERY_DEPLOYMENT_PLATFORM == 'aws':
                # check if S3 object key exists
                bucket_name, key = parse_s3_url(input_file_path)
                if key not in uploaded_s3_key_list:
                    bad_file_list.append(os.path.basename(key))
                    logger.debug("Object key '%s' does not exist in '%s'",
                                 key, bucket_name)
                else:
                    logger.debug("Object key '%s' exists in '%s'",
                                 key, bucket_name)
            else:  # POSIX file system
                if not os.path.exists(input_file_path):
                    bad_file_list.append(os.path.basename(input_file_path))
                    logger.debug("File '%s' does not exist", input_file_path)
                else:
                    logger.debug("File '%s' exists", input_file_path)

    response_data = {
        "data_files_not_uploaded": [
            file_name for file_name in bad_file_list
            if file_name not in existing_datafile_names
        ],
        "data_files_to_be_deleted": [
            file_name for file_name in existing_datafile_names
            if file_name not in bad_file_list
        ]
    }
    return JsonResponse(response_data)
def parse_isatab(username, public, path, identity_id=None,
                 additional_raw_data_file_extension=None, isa_archive=None,
                 pre_isa_archive=None, file_base_path=None, overwrite=False):
    """Parses an ISA-Tab file to create database entries and creates or
    updates a dataset for the investigation to belong to; returns the
    dataset UUID or None if something went wrong.
    Use like this: parse_isatab(username, is_public, folder_name,
        additional_raw_data_file_extension, isa_archive=<path>,
        pre_isa_archive=<path>, file_base_path=<path>)
    Parameters:
    username: username of the person the dataset will belong to
    public: boolean that determines if the dataset is public or not
    path: absolute path of the ISA-Tab file to parse
    additional_raw_data_file_extension: an optional argument that will
        append a suffix to items in Raw Data File as needed
    isa_archive: if you're passing a directory, a zipped version of the
        directory for storage and legacy purposes
    pre_isa_archive: optional copy of files that were converted to ISA-Tab
    file_base_path: if your file locations are relative paths, this is the
        base
    """
    file_source_translator = generate_file_source_translator(
        username=username, base_path=file_base_path,
        identity_id=identity_id)
    parser = IsaTabParser(
        file_source_translator=file_source_translator,
        additional_raw_data_file_extension=additional_raw_data_file_extension,
    )
    """Get the study title and investigation id and see if anything is in
    the database and if so compare the checksum
    """
    # 1. First check whether the user exists
    try:
        user = User.objects.get(username__exact=username)
    except (User.DoesNotExist, User.MultipleObjectsReturned):
        user = None
    # 2. If the user exists we need to quickly get the dataset title to see
    # if it's already in the DB
    if user:
        checksum = None
        (identifier, title) = parser.get_dataset_name(path)
        if identifier is None or title is None:
            datasets = []
        else:
            dataset_title = "%s: %s" % (identifier, title)
            datasets = DataSet.objects.filter(name=dataset_title)
        # check if the investigation already exists
        if len(datasets):  # if not 0, update dataset with new investigation
            # go through datasets until you find one with the correct owner
            for ds in datasets:
                own = ds.get_owner()
                if own == user:
                    if overwrite:
                        # Remove the existing data set first
                        checksum = False
                        ds.delete()
                    else:
                        # 3. Finally we need to get the checksum so that we
                        # can compare that to our given file.
                        investigation = ds.get_investigation()
                        fileStoreItem = FileStoreItem.objects.get(
                            uuid=investigation.isarchive_file)
                        if fileStoreItem:
                            try:
                                logger.info(
                                    "Get file: %s",
                                    fileStoreItem.get_absolute_path())
                                checksum = calculate_checksum(
                                    fileStoreItem.get_file_object())
                            except IOError as exc:
                                logger.error(
                                    "Original ISA-tab archive wasn't found. "
                                    "Error: '%s'", exc)
        # 4. Finally if we got a checksum for an existing file, we calculate
        # the checksum for the new file and compare them
        if checksum:
            new_checksum = None
            # TODO: error handling
            with open(path, 'rb') as f:
                new_checksum = calculate_checksum(f)
            if checksum == new_checksum:
                # Checksums are identical so we can skip this file.
                logger.info("The checksum of both files is the same: %s",
                            checksum)
                return \
                    investigation.investigationlink_set.all()[0].data_set.uuid

    with transaction.atomic():
        investigation = parser.run(path, isa_archive=isa_archive,
                                   preisa_archive=pre_isa_archive)
        data_uuid = create_dataset(investigation.uuid, username,
                                   public=public)
    return data_uuid
def parse_isatab(username, public, path, identity_id=None,
                 additional_raw_data_file_extension=None, isa_archive=None,
                 pre_isa_archive=None, file_base_path=None, overwrite=False,
                 existing_data_set_uuid=None):
    """Parses an ISA-Tab file to create database entries and creates or
    updates a dataset for the investigation to belong to; returns the
    dataset UUID or None if something went wrong.
    Use like this: parse_isatab(username, is_public, folder_name,
        additional_raw_data_file_extension, isa_archive=<path>,
        pre_isa_archive=<path>, file_base_path=<path>)
    Parameters:
    username: username of the person the dataset will belong to
    public: boolean that determines if the dataset is public or not
    path: absolute path of the ISA-Tab file to parse
    additional_raw_data_file_extension: an optional argument that will
        append a suffix to items in Raw Data File as needed
    isa_archive: if you're passing a directory, a zipped version of the
        directory for storage and legacy purposes
    pre_isa_archive: optional copy of files that were converted to ISA-Tab
    file_base_path: if your file locations are relative paths, this is the
        base
    existing_data_set_uuid: UUID of an existing DataSet that a metadata
        revision is to be performed upon
    """
    file_source_translator = generate_file_source_translator(
        username=username, base_path=file_base_path, identity_id=identity_id
    )
    parser = IsaTabParser(
        file_source_translator=file_source_translator,
        additional_raw_data_file_extension=additional_raw_data_file_extension,
    )
    """Get the study title and investigation id and see if anything is in
    the database and if so compare the checksum
    """
    # 1. First check whether the user exists
    try:
        user = User.objects.get(username__exact=username)
    except (User.DoesNotExist, User.MultipleObjectsReturned):
        user = None
    # 2. If the user exists we need to quickly get the dataset title to see
    # if it's already in the DB
    if user:
        checksum = None
        (identifier, title) = parser.get_dataset_name(path)
        if identifier is None or title is None:
            datasets = []
        else:
            dataset_title = "%s: %s" % (identifier, title)
            datasets = DataSet.objects.filter(name=dataset_title)
        # check if the investigation already exists
        # if not 0, update dataset with new investigation
        if len(datasets) and not existing_data_set_uuid:
            # go through datasets until you find one with the correct owner
            for ds in datasets:
                own = ds.get_owner()
                if own == user:
                    if overwrite:
                        # Remove the existing data set first
                        checksum = False
                        ds.delete()
                    else:
                        # 3. Finally we need to get the checksum so that we
                        # can compare that to our given file.
                        investigation = ds.get_investigation()
                        fileStoreItem = FileStoreItem.objects.get(
                            uuid=investigation.isarchive_file)
                        if fileStoreItem:
                            try:
                                logger.info("Get file: %s", fileStoreItem)
                                checksum = calculate_checksum(
                                    fileStoreItem.datafile
                                )
                            except IOError as exc:
                                logger.error(
                                    "Original ISA-tab archive wasn't found. "
                                    "Error: '%s'", exc
                                )
        # 4. Finally if we got a checksum for an existing file, we calculate
        # the checksum for the new file and compare them
        if checksum:
            new_checksum = None
            # TODO: error handling
            with open(path, 'rb') as f:
                new_checksum = calculate_checksum(f)
            if checksum == new_checksum:
                # Checksums are identical so we can skip this file.
                logger.info("The checksum of both files is the same: %s",
                            checksum)
                return \
                    investigation.investigationlink_set.all()[0].data_set.uuid

    with transaction.atomic():
        investigation = parser.run(
            path, isa_archive=isa_archive, preisa_archive=pre_isa_archive
        )
        if existing_data_set_uuid:
            data_set = DataSet.objects.get(uuid=existing_data_set_uuid)
            data_set.update_with_revised_investigation(investigation)
            return existing_data_set_uuid
        data_set_uuid = create_dataset(
            investigation.uuid, username, public=public
        )
    return data_set_uuid
def test_translate_from_relative_path_without_username_or_base_path(self):
    translate_file_source = generate_file_source_translator()
    with self.assertRaises(ValueError):
        translate_file_source(self.rel_path_source)
def process_metadata_table(
        username, title, metadata_file, source_columns, data_file_column,
        auxiliary_file_column=None, base_path="", data_file_permanent=False,
        species_column=None, genome_build_column=None, annotation_column=None,
        sample_column=None, assay_column=None, is_public=False,
        delimiter="comma", custom_delimiter_string=",", identity_id=None,
        existing_data_set_uuid=None
):
    """Create a dataset given a metadata file object and its description

    :param username: username
    :type username: str
    :param title: dataset name
    :type title: str
    :param metadata_file: metadata file in tab-delimited format
    :type metadata_file: file
    :param source_columns: a list of source column indices
    :type source_columns: list of ints
    :param data_file_column: data file column index
    :type data_file_column: int
    :param data_file_permanent: should data files be imported
    :type data_file_permanent: bool
    :param base_path: path to append to data file
    :type base_path: str
    :param auxiliary_file_column: auxiliary file column index
    :type auxiliary_file_column: int
    :param species_column: species column index
    :type species_column: int
    :param genome_build_column: genome build column index
    :type genome_build_column: int
    :param annotation_column: annotation column index
    :type annotation_column: int
    :param is_public: is dataset available to public
    :type is_public: bool
    :param existing_data_set_uuid: UUID of an existing DataSet that a
        metadata revision is to be performed upon
    :returns: UUID of the new dataset
    """
    try:
        source_columns = [abs(int(x)) for x in source_columns]
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("source column indices must be integers")
    try:
        data_file_column = int(data_file_column)
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("data file column index must be an integer")
    try:
        auxiliary_file_column = int(auxiliary_file_column)
    except (TypeError, ValueError):
        auxiliary_file_column = None
    try:
        base_path = str(base_path)
    except ValueError:
        base_path = ""
    try:
        species_column = int(species_column)
    except (TypeError, ValueError):
        species_column = None
    try:
        genome_build_column = int(genome_build_column)
    except (TypeError, ValueError):
        genome_build_column = None
    try:
        annotation_column = int(annotation_column)
    except (TypeError, ValueError):
        annotation_column = None
    try:
        sample_column = int(sample_column)
    except (TypeError, ValueError):
        sample_column = None
    try:
        assay_column = int(assay_column)
    except (TypeError, ValueError):
        assay_column = None
    try:
        delimiter = str(delimiter)
    except ValueError:
        delimiter = "comma"
    try:
        custom_delimiter_string = str(custom_delimiter_string)
    except ValueError:
        custom_delimiter_string = ","
    data_file_permanent = bool(data_file_permanent)
    is_public = bool(is_public)

    file_source_translator = generate_file_source_translator(
        username=username, base_path=base_path, identity_id=identity_id
    )
    # TODO: From here on should be run within a transaction so as not to
    # commit things to the db on an import failure, but doing so doesn't
    # allow for the association of uploaded datafiles
    parser = SingleFileColumnParser(
        metadata_file=metadata_file,
        file_source_translator=file_source_translator,
        source_column_index=source_columns,
        data_file_column_index=data_file_column,
        auxiliary_file_column_index=auxiliary_file_column,
        file_base_path=base_path,
        data_file_permanent=data_file_permanent,
        species_column_index=species_column,
        genome_build_column_index=genome_build_column,
        annotation_column_index=annotation_column,
        sample_column_index=sample_column,
        assay_column_index=assay_column,
        column_index_separator="/",
        delimiter=delimiter,
        custom_delimiter_string=custom_delimiter_string
    )

    investigation = parser.run()
    investigation.title = title
    investigation.save()

    if existing_data_set_uuid:
        data_set = DataSet.objects.get(uuid=existing_data_set_uuid)
        data_set.update_with_revised_investigation(investigation)
        return existing_data_set_uuid

    return create_dataset(
        investigation_uuid=investigation.uuid,
        username=username,
        dataset_name=title,
        public=is_public
    )
def parse_isatab(username, public, path, identity_id=None,
                 additional_raw_data_file_extension=None, isa_archive=None,
                 pre_isa_archive=None, file_base_path=None, overwrite=False,
                 existing_data_set_uuid=None):
    """Parses an ISA-Tab file to create database entries and creates or
    updates a dataset for the investigation to belong to; returns the
    dataset UUID or None if something went wrong.
    Use like this: parse_isatab(username, is_public, folder_name,
        additional_raw_data_file_extension, isa_archive=<path>,
        pre_isa_archive=<path>, file_base_path=<path>)
    Parameters:
    username: username of the person the dataset will belong to
    public: boolean that determines if the dataset is public or not
    path: absolute path of the ISA-Tab file to parse
    additional_raw_data_file_extension: an optional argument that will
        append a suffix to items in Raw Data File as needed
    isa_archive: if you're passing a directory, a zipped version of the
        directory for storage and legacy purposes
    pre_isa_archive: optional copy of files that were converted to ISA-Tab
    file_base_path: if your file locations are relative paths, this is the
        base
    existing_data_set_uuid: UUID of an existing DataSet that a metadata
        revision is to be performed upon
    """
    file_source_translator = generate_file_source_translator(
        username=username, base_path=file_base_path,
        identity_id=identity_id)
    parser = IsaTabParser(
        file_source_translator=file_source_translator,
        additional_raw_data_file_extension=additional_raw_data_file_extension,
    )
    """Get the study title and investigation id and see if anything is in
    the database and if so compare the checksum
    """
    # 1. First check whether the user exists
    try:
        user = User.objects.get(username__exact=username)
    except (User.DoesNotExist, User.MultipleObjectsReturned):
        user = None
    # 2. If the user exists we need to quickly get the dataset title to see
    # if it's already in the DB
    if user:
        checksum = None
        (identifier, title) = parser.get_dataset_name(path)
        if identifier is None or title is None:
            datasets = []
        else:
            dataset_title = "%s: %s" % (identifier, title)
            datasets = DataSet.objects.filter(name=dataset_title)
        # check if the investigation already exists
        # if not 0, update dataset with new investigation
        if len(datasets) and not existing_data_set_uuid:
            # go through datasets until you find one with the correct owner
            for ds in datasets:
                own = ds.get_owner()
                if own == user:
                    if overwrite:
                        # Remove the existing data set first
                        checksum = False
                        ds.delete()
                    else:
                        # 3. Finally we need to get the checksum so that we
                        # can compare that to our given file.
                        investigation = ds.get_investigation()
                        try:
                            # isarchive_file should be a uuid foreign key
                            # upon creation of either FileStoreItem or
                            # Investigation in isa_tab_parser.py
                            file_store_item = FileStoreItem.objects.get(
                                uuid=investigation.isarchive_file)
                            logger.info("Get file: %s", file_store_item)
                        # will fail later on when the .datafile is accessed
                        except (FileStoreItem.DoesNotExist,
                                FileStoreItem.MultipleObjectsReturned) as e:
                            logger.error(
                                'Did not get FileStoreItem for uuid %s: %s',
                                unicode(investigation.isarchive_file), e)
                        try:
                            checksum = calculate_checksum(
                                file_store_item.datafile)
                        except (EnvironmentError,
                                botocore.exceptions.BotoCoreError,
                                botocore.exceptions.ClientError) as exc:
                            logger.error(
                                "Original ISA-tab archive was not found: %s",
                                exc)
        # 4. Finally if we got a checksum for an existing file, we calculate
        # the checksum for the new file and compare them
        if checksum:
            new_checksum = None
            # TODO: error handling
            with open(path, 'rb') as f:
                new_checksum = calculate_checksum(f)
            if checksum == new_checksum:
                # Checksums are identical so we can skip this file.
                logger.info("The checksum of both files is the same: %s",
                            checksum)
                return \
                    investigation.investigationlink_set.all()[0].data_set.uuid

    with transaction.atomic():
        investigation = parser.run(path, isa_archive=isa_archive,
                                   preisa_archive=pre_isa_archive)
        if existing_data_set_uuid:
            try:
                data_set = DataSet.objects.get(uuid=existing_data_set_uuid)
            except (DataSet.DoesNotExist,
                    DataSet.MultipleObjectsReturned) as e:
                logger.error(
                    'DataSet for uuid %s not fetched and thus not '
                    'updated with revised investigation %s: %s',
                    existing_data_set_uuid, unicode(investigation), e)
                raise type(e)(
                    'DataSet for uuid {} not fetched and thus not '
                    'updated with revised investigation {}'.format(
                        existing_data_set_uuid, unicode(investigation)))
            else:
                data_set.update_with_revised_investigation(investigation)
                return existing_data_set_uuid
        data_set_uuid = create_dataset(investigation.uuid, username,
                                       public=public)
    return data_set_uuid
def parse(self, dir_name):
    file_source_translator = generate_file_source_translator(
        username=self.user.username)
    dir = os.path.join(TEST_DATA_BASE_PATH, dir_name)
    return IsaTabParser(
        file_source_translator=file_source_translator).run(dir)
def process_metadata_table(username, title, metadata_file, source_columns,
                           data_file_column, auxiliary_file_column=None,
                           base_path="", data_file_permanent=False,
                           species_column=None, genome_build_column=None,
                           annotation_column=None, sample_column=None,
                           assay_column=None, slug=None, is_public=False,
                           delimiter="comma", custom_delimiter_string=",",
                           identity_id=None):
    """Create a dataset given a metadata file object and its description

    :param username: username
    :type username: str
    :param title: dataset name
    :type title: str
    :param metadata_file: metadata file in tab-delimited format
    :type metadata_file: file
    :param source_columns: a list of source column indices
    :type source_columns: list of ints
    :param data_file_column: data file column index
    :type data_file_column: int
    :param data_file_permanent: should data files be imported
    :type data_file_permanent: bool
    :param base_path: path to append to data file
    :type base_path: str
    :param auxiliary_file_column: auxiliary file column index
    :type auxiliary_file_column: int
    :param species_column: species column index
    :type species_column: int
    :param genome_build_column: genome build column index
    :type genome_build_column: int
    :param annotation_column: annotation column index
    :type annotation_column: int
    :param slug: dataset name shortcut
    :type slug: str
    :param is_public: is dataset available to public
    :type is_public: bool
    :returns: UUID of the new dataset
    """
    try:
        source_columns = [abs(int(x)) for x in source_columns]
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("source column indices must be integers")
    try:
        data_file_column = int(data_file_column)
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("data file column index must be an integer")
    try:
        auxiliary_file_column = int(auxiliary_file_column)
    except (TypeError, ValueError):
        auxiliary_file_column = None
    try:
        base_path = str(base_path)
    except ValueError:
        base_path = ""
    try:
        species_column = int(species_column)
    except (TypeError, ValueError):
        species_column = None
    try:
        genome_build_column = int(genome_build_column)
    except (TypeError, ValueError):
        genome_build_column = None
    try:
        annotation_column = int(annotation_column)
    except (TypeError, ValueError):
        annotation_column = None
    try:
        sample_column = int(sample_column)
    except (TypeError, ValueError):
        sample_column = None
    try:
        assay_column = int(assay_column)
    except (TypeError, ValueError):
        assay_column = None
    try:
        if slug:
            slug = str(slug)
    except ValueError:
        slug = None
    try:
        delimiter = str(delimiter)
    except ValueError:
        delimiter = "comma"
    try:
        custom_delimiter_string = str(custom_delimiter_string)
    except ValueError:
        custom_delimiter_string = ","
    data_file_permanent = bool(data_file_permanent)
    is_public = bool(is_public)

    file_source_translator = generate_file_source_translator(
        username=username, base_path=base_path, identity_id=identity_id)
    parser = SingleFileColumnParser(
        metadata_file=metadata_file,
        file_source_translator=file_source_translator,
        source_column_index=source_columns,
        data_file_column_index=data_file_column,
        auxiliary_file_column_index=auxiliary_file_column,
        file_base_path=base_path,
        data_file_permanent=data_file_permanent,
        species_column_index=species_column,
        genome_build_column_index=genome_build_column,
        annotation_column_index=annotation_column,
        sample_column_index=sample_column,
        assay_column_index=assay_column,
        column_index_separator="/",
        delimiter=delimiter,
        custom_delimiter_string=custom_delimiter_string)

    investigation = parser.run()
    investigation.title = title
    investigation.save()

    return create_dataset(investigation_uuid=investigation.uuid,
                          username=username,
                          dataset_name=title,
                          slug=slug,
                          public=is_public)
def process_metadata_table(username, title, metadata_file, source_columns,
                           data_file_column, auxiliary_file_column=None,
                           base_path="", data_file_permanent=False,
                           species_column=None, genome_build_column=None,
                           annotation_column=None, sample_column=None,
                           assay_column=None, is_public=False,
                           delimiter="comma", custom_delimiter_string=",",
                           identity_id=None, existing_data_set_uuid=None):
    """Create a dataset given a metadata file object and its description

    :param username: username
    :type username: str
    :param title: dataset name
    :type title: str
    :param metadata_file: metadata file in tab-delimited format
    :type metadata_file: file
    :param source_columns: a list of source column indices
    :type source_columns: list of ints
    :param data_file_column: data file column index
    :type data_file_column: int
    :param data_file_permanent: should data files be imported
    :type data_file_permanent: bool
    :param base_path: path to append to data file
    :type base_path: str
    :param auxiliary_file_column: auxiliary file column index
    :type auxiliary_file_column: int
    :param species_column: species column index
    :type species_column: int
    :param genome_build_column: genome build column index
    :type genome_build_column: int
    :param annotation_column: annotation column index
    :type annotation_column: int
    :param is_public: is dataset available to public
    :type is_public: bool
    :param existing_data_set_uuid: UUID of an existing DataSet that a
        metadata revision is to be performed upon
    :returns: UUID of the new dataset
    """
    try:
        source_columns = [abs(int(x)) for x in source_columns]
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("source column indices must be integers")
    try:
        data_file_column = int(data_file_column)
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("data file column index must be an integer")
    try:
        auxiliary_file_column = int(auxiliary_file_column)
    except (TypeError, ValueError):
        auxiliary_file_column = None
    try:
        base_path = str(base_path)
    except ValueError:
        base_path = ""
    try:
        species_column = int(species_column)
    except (TypeError, ValueError):
        species_column = None
    try:
        genome_build_column = int(genome_build_column)
    except (TypeError, ValueError):
        genome_build_column = None
    try:
        annotation_column = int(annotation_column)
    except (TypeError, ValueError):
        annotation_column = None
    try:
        sample_column = int(sample_column)
    except (TypeError, ValueError):
        sample_column = None
    try:
        assay_column = int(assay_column)
    except (TypeError, ValueError):
        assay_column = None
    try:
        delimiter = str(delimiter)
    except ValueError:
        delimiter = "comma"
    try:
        custom_delimiter_string = str(custom_delimiter_string)
    except ValueError:
        custom_delimiter_string = ","
    data_file_permanent = bool(data_file_permanent)
    is_public = bool(is_public)

    file_source_translator = generate_file_source_translator(
        username=username, base_path=base_path, identity_id=identity_id)
    # TODO: From here on should be run within a transaction so as not to
    # commit things to the db on an import failure, but doing so doesn't
    # allow for the association of uploaded datafiles
    parser = SingleFileColumnParser(
        metadata_file=metadata_file,
        file_source_translator=file_source_translator,
        source_column_index=source_columns,
        data_file_column_index=data_file_column,
        auxiliary_file_column_index=auxiliary_file_column,
        file_base_path=base_path,
        data_file_permanent=data_file_permanent,
        species_column_index=species_column,
        genome_build_column_index=genome_build_column,
        annotation_column_index=annotation_column,
        sample_column_index=sample_column,
        assay_column_index=assay_column,
        column_index_separator="/",
        delimiter=delimiter,
        custom_delimiter_string=custom_delimiter_string)

    investigation = parser.run()
    investigation.title = title
    investigation.save()

    if existing_data_set_uuid:
        try:
            data_set = DataSet.objects.get(uuid=existing_data_set_uuid)
        except (DataSet.DoesNotExist,
                DataSet.MultipleObjectsReturned) as e:
            logger.error(
                'DataSet for uuid %s not fetched and thus not '
                'updated with revised investigation %s: %s',
                existing_data_set_uuid, unicode(investigation), e)
            raise type(e)('DataSet for uuid {} not fetched and thus not '
                          'updated with revised investigation {}'.format(
                              existing_data_set_uuid,
                              unicode(investigation)))
        else:
            data_set.update_with_revised_investigation(investigation)
            return existing_data_set_uuid

    return create_dataset(investigation_uuid=investigation.uuid,
                          username=username,
                          dataset_name=title,
                          public=is_public)