Example #1
    def post(self, request):
        try:
            node_uuid = request.data["node_uuid"]
        except KeyError:
            return HttpResponseBadRequest("`node_uuid` required")

        identity_id = request.data.get("identity_id")
        if settings.REFINERY_DEPLOYMENT_PLATFORM == "aws" and not identity_id:
            return HttpResponseBadRequest("`identity_id` required")
        elif settings.REFINERY_DEPLOYMENT_PLATFORM != "aws" and identity_id:
            return HttpResponseBadRequest("`identity_id` not permitted for "
                                          "non-AWS deployments")

        try:
            node = Node.objects.get(uuid=node_uuid)
        except Node.DoesNotExist:
            logger.error("Node with UUID '%s' does not exist", node_uuid)
            return HttpResponseNotFound()
        except Node.MultipleObjectsReturned:
            logger.critical("Multiple Nodes found with UUID '%s'", node_uuid)
            return HttpResponseServerError()

        if request.user != node.study.get_dataset().get_owner():
            return HttpResponseForbidden()

        file_store_item = node.get_file_store_item()
        if (file_store_item and not file_store_item.datafile and
                file_store_item.source.startswith(
                    (settings.REFINERY_DATA_IMPORT_DIR, 's3://')
                )):
            logger.debug("Adding file to Node '%s'", node)

            file_store_item.source = os.path.basename(file_store_item.source)
            file_store_item.save()

            if identity_id:
                file_source_translator = generate_file_source_translator(
                    identity_id=identity_id
                )
            else:
                file_source_translator = generate_file_source_translator(
                    username=request.user.username
                )
            translated_datafile_source = file_source_translator(
                file_store_item.source
            )
            file_store_item.source = translated_datafile_source

            # Remove the FileStoreItem's import_task_id to treat it as a
            # brand new file import task when called below.
            # We then have to update its Node's Solr index entry, so the
            # updated file import status is available in the UI.
            file_store_item.import_task_id = ""
            file_store_item.save()
            node.update_solr_index()
            FileImportTask().delay(file_store_item.uuid)

        return HttpResponse(status=202)  # Accepted
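Note: later examples (#5 and #11) pass username, base_path, and identity_id to generate_file_source_translator in a single call, so the branch on identity_id above could presumably be collapsed. A minimal standalone sketch, assuming the translator accepts both keyword arguments at once and ignores a None identity_id:

# Sketch only (not the project's actual code): assumes the translator
# accepts both keyword arguments together, as in Examples #5 and #11,
# and that a None identity_id is simply ignored.
file_source_translator = generate_file_source_translator(
    username=request.user.username,
    identity_id=identity_id,  # None on non-AWS deployments
)
file_store_item.source = file_source_translator(file_store_item.source)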
Example #2
    def post(self, request, *args, **kwargs):
        if not request.is_ajax() or not request.body:
            return HttpResponseBadRequest()

        file_data = json.loads(request.body)
        try:
            base_path = file_data["base_path"]
        except KeyError:
            base_path = ""

        bad_file_list = []
        translate_file_source = generate_file_source_translator(
            username=request.user.username, base_path=base_path)
        # check if files are available
        try:
            for file_path in file_data["list"]:
                if not isinstance(file_path, str):
                    bad_file_list.append(file_path)
                else:
                    file_path = translate_file_source(file_path)
                    if not os.path.exists(file_path):
                        bad_file_list.append(file_path)
                logger.debug("Checked file path: '%s'", file_path)
        except KeyError:  # if there's no list provided
            return HttpResponseBadRequest()
        # prefix output to protect from JSON vulnerability (stripped by
        # Angular)
        return HttpResponse(")]}',\n" + json.dumps(bad_file_list),
                            content_type="application/json")
Example #3
    def post(self, request, *args, **kwargs):
        if not request.is_ajax() or not request.body:
            return HttpResponseBadRequest()

        file_data = json.loads(request.body)
        try:
            base_path = file_data["base_path"]
        except KeyError:
            base_path = ""

        bad_file_list = []
        translate_file_source = generate_file_source_translator(
            username=request.user.username, base_path=base_path)
        # check if files are available
        try:
            for file_path in file_data["list"]:
                # Explicitly check if file_path here is a string or unicode
                # string
                if not isinstance(file_path, unicode):
                    bad_file_list.append(file_path)
                else:
                    file_path = translate_file_source(file_path)
                    if not os.path.exists(file_path):
                        bad_file_list.append(file_path)
                logger.debug("Checked file path: '%s'", file_path)
        except KeyError:  # if there's no list provided
            return HttpResponseBadRequest()
        # prefix output to protect from JSON vulnerability (stripped by
        # Angular)
        return HttpResponse(")]}',\n" + json.dumps(bad_file_list),
                            content_type="application/json")
Example #4
 def test_translate_from_relative_path_without_base_path(self):
     translate_file_source = \
         generate_file_source_translator(username=self.username)
     source = translate_file_source(self.rel_path_source)
     self.assertEqual(source, os.path.join(
         settings.REFINERY_DATA_IMPORT_DIR,
         self.username,
         self.rel_path_source))
Example #5
    def post(self, request, *args, **kwargs):
        if not request.is_ajax() or not request.body:
            return HttpResponseBadRequest()

        try:
            file_data = json.loads(request.body)
        except ValueError:
            return HttpResponseBadRequest()
        try:
            input_file_list = file_data['list']
        except KeyError:
            return HttpResponseBadRequest()

        try:
            base_path = file_data['base_path']
        except KeyError:
            base_path = None
        try:
            identity_id = file_data['identity_id']
        except KeyError:
            identity_id = None

        bad_file_list = []
        translate_file_source = generate_file_source_translator(
            username=request.user.username,
            base_path=base_path,
            identity_id=identity_id)

        if settings.REFINERY_DEPLOYMENT_PLATFORM == 'aws':
            # get a list of all uploaded S3 objects for the user
            uploaded_s3_key_list = []
            s3 = boto3.resource('s3')
            s3_bucket = s3.Bucket(settings.MEDIA_BUCKET)
            for s3_object in s3_bucket.objects.filter(
                    Prefix='uploads/{}'.format(identity_id)):
                uploaded_s3_key_list.append(s3_object.key)

        for input_file_path in input_file_list:
            if not isinstance(input_file_path, unicode):
                bad_file_list.append(input_file_path)
            else:
                input_file_path = translate_file_source(input_file_path)
                if settings.REFINERY_DEPLOYMENT_PLATFORM == 'aws':
                    # check if S3 object key exists
                    bucket_name, key = parse_s3_url(input_file_path)
                    if key not in uploaded_s3_key_list:
                        bad_file_list.append(os.path.basename(key))
                else:  # POSIX file system
                    if not os.path.exists(input_file_path):
                        bad_file_list.append(input_file_path)
            logger.debug("Checked file path: '%s'", input_file_path)

        # prefix output to protect from JSON vulnerability (stripped by
        # Angular)
        return HttpResponse(")]}',\n" + json.dumps(bad_file_list),
                            content_type="application/json")
Example #6
def process_metadata_table(username, title, metadata_file, source_columns,
                           data_file_column, auxiliary_file_column=None,
                           base_path="", data_file_permanent=False,
                           species_column=None, genome_build_column=None,
                           annotation_column=None, sample_column=None,
                           assay_column=None, slug=None, is_public=False):
    """Create a dataset given a metadata file object and its description
    :param username: username
    :type username: str
    :param title: dataset name
    :type title: str
    :param metadata_file: metadata file in tab-delimited format
    :type metadata_file: file
    :param source_columns: a list of source column indices
    :type source_columns: list of ints
    :param data_file_column: data file column index
    :type data_file_column: int
    :param data_file_permanent: should data files be imported
    :type data_file_permanent: bool
    :param base_path: path to append to data file
    :type base_path: str
    :param auxiliary_file_column: auxiliary file column index
    :type auxiliary_file_column: int
    :param species_column: species column index
    :type species_column: int
    :param genome_build_column: genome build column index
    :type genome_build_column: int
    :param annotation_column: annotation column index
    :type annotation_column: int
    :param slug: dataset name shortcut
    :type slug: str
    :param is_public: is dataset available to public
    :type is_public: bool
    :returns: UUID of the new dataset
    """
    try:
        source_columns = [abs(int(x)) for x in source_columns]
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("source column indices must be integers")
    try:
        data_file_column = int(data_file_column)
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("data file column index must be an integer")
    try:
        auxiliary_file_column = int(auxiliary_file_column)
    except (TypeError, ValueError):
        auxiliary_file_column = None
    try:
        base_path = str(base_path)
    except ValueError:
        base_path = ""
    try:
        species_column = int(species_column)
    except (TypeError, ValueError):
        species_column = None
    try:
        genome_build_column = int(genome_build_column)
    except (TypeError, ValueError):
        genome_build_column = None
    try:
        annotation_column = int(annotation_column)
    except (TypeError, ValueError):
        annotation_column = None
    try:
        sample_column = int(sample_column)
    except (TypeError, ValueError):
        sample_column = None
    try:
        assay_column = int(assay_column)
    except (TypeError, ValueError):
        assay_column = None
    try:
        slug = str(slug)
    except ValueError:
        slug = None
    data_file_permanent = bool(data_file_permanent)
    is_public = bool(is_public)
    file_source_translator = generate_file_source_translator(
        username=username, base_path=base_path)
    parser = SingleFileColumnParser(
        metadata_file=metadata_file,
        file_source_translator=file_source_translator,
        source_column_index=source_columns,
        data_file_column_index=data_file_column,
        auxiliary_file_column_index=auxiliary_file_column,
        file_base_path=base_path, data_file_permanent=data_file_permanent,
        species_column_index=species_column,
        genome_build_column_index=genome_build_column,
        annotation_column_index=annotation_column,
        sample_column_index=sample_column, assay_column_index=assay_column,
        column_index_separator="/")
    investigation = parser.run()
    investigation.title = title
    investigation.save()

    return create_dataset(
        investigation_uuid=investigation.uuid, username=username,
        dataset_name=title, slug=slug, public=is_public)
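Given the signature and docstring above, a hypothetical invocation could look like the following; the username, title, file path, and column indices are invented for illustration.

# Hypothetical call, based only on the signature documented above;
# all concrete values are made up.
with open("/tmp/samples.txt") as metadata_file:
    data_set_uuid = process_metadata_table(
        username="alice",
        title="Example samples",
        metadata_file=metadata_file,
        source_columns=[0, 1],     # indices of the sample source columns
        data_file_column=2,        # index of the data file column
        base_path="/data/import",  # prepended to relative data file paths
        data_file_permanent=True,
        is_public=False,
    )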
Example #7
 def test_translate_from_relative_path_with_base_path(self):
     translate_file_source = \
         generate_file_source_translator(base_path=self.base_path)
     source = translate_file_source(self.rel_path_source)
     self.assertEqual(source,
                      os.path.join(self.base_path, self.rel_path_source))
Example #8
 def test_translate_from_absolute_path(self):
     translate_file_source = generate_file_source_translator()
     source = translate_file_source(self.abs_path_source)
     self.assertEqual(source, self.abs_path_source)
Example #9
 def test_translate_from_url(self):
     translate_file_source = generate_file_source_translator()
     source = translate_file_source(self.url_source)
     self.assertEqual(source, self.url_source)
Example #10
 def test_translate_with_map(self):
     settings.REFINERY_FILE_SOURCE_MAP = {self.url_prefix: '/new/path/'}
     translate_file_source = generate_file_source_translator()
     source = translate_file_source(self.url_source)
     self.assertEqual(source, os.path.join('/new/path/', self.filename))
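Taken together, the tests in Examples #4 and #7-#10 (plus #14 further down) pin down the behavior the callers above rely on: relative paths are resolved against base_path or against REFINERY_DATA_IMPORT_DIR plus the username, absolute paths and URLs pass through unchanged, prefixes found in REFINERY_FILE_SOURCE_MAP are rewritten, and a relative path with neither base_path nor username raises ValueError. A minimal sketch with that behavior, reconstructed from the test assertions only and not from the real implementation:

# Sketch reconstructed from the test assertions only; argument precedence
# (base_path before username) and the "://" URL check are assumptions.
import os

from django.conf import settings


def sketch_file_source_translator(username=None, base_path=None):
    def translate(source):
        # prefix mapping, as exercised in Example #10
        for prefix, replacement in settings.REFINERY_FILE_SOURCE_MAP.items():
            if source.startswith(prefix):
                return source.replace(prefix, replacement, 1)
        # absolute paths and URLs pass through (Examples #8 and #9)
        if os.path.isabs(source) or "://" in source:
            return source
        # relative paths need base_path or a username (Examples #4, #7, #14)
        if base_path:
            return os.path.join(base_path, source)
        if username:
            return os.path.join(
                settings.REFINERY_DATA_IMPORT_DIR, username, source)
        raise ValueError("Relative path '%s' requires base_path or "
                         "username" % source)
    return translate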
Example #11
    def post(self, request, *args, **kwargs):
        existing_data_set_uuid = request.GET.get('data_set_uuid')
        existing_datafile_names = []
        if existing_data_set_uuid:
            data_set = get_object_or_404(DataSet, uuid=existing_data_set_uuid)
            investigation = data_set.get_investigation()
            existing_datafile_names = investigation.get_datafile_names(
                local_only=True, exclude_metadata_file=True
            )

        if not request.is_ajax() or not request.body:
            return HttpResponseBadRequest()

        try:
            file_data = json.loads(request.body)
        except ValueError:
            return HttpResponseBadRequest()
        try:
            input_file_list = file_data['list']
        except KeyError:
            return HttpResponseBadRequest()

        try:
            base_path = file_data['base_path']
        except KeyError:
            base_path = None
        try:
            identity_id = file_data['identity_id']
        except KeyError:
            identity_id = None

        bad_file_list = []
        translate_file_source = generate_file_source_translator(
            username=request.user.username, base_path=base_path,
            identity_id=identity_id
        )

        uploaded_s3_key_list = []
        if settings.REFINERY_DEPLOYMENT_PLATFORM == 'aws':
            # get a list of all uploaded S3 objects for the user
            s3 = boto3.resource('s3')
            s3_bucket = s3.Bucket(settings.UPLOAD_BUCKET)
            # TODO: handle ParamValidationError (return error msg in response?)
            for s3_object in s3_bucket.objects.filter(Prefix=identity_id):
                uploaded_s3_key_list.append(s3_object.key)

        for input_file_path in input_file_list:
            if not isinstance(input_file_path, unicode):
                bad_file_list.append(input_file_path)
                logger.error("Uploaded file path '%s' is not a string",
                             input_file_path)
            else:
                input_file_path = translate_file_source(input_file_path)
                if settings.REFINERY_DEPLOYMENT_PLATFORM == 'aws':
                    # check if S3 object key exists
                    bucket_name, key = parse_s3_url(input_file_path)
                    if key not in uploaded_s3_key_list:
                        bad_file_list.append(os.path.basename(key))
                        logger.debug("Object key '%s' does not exist in '%s'",
                                     key, bucket_name)
                    else:
                        logger.debug("Object key '%s' exists in '%s'",
                                     key, bucket_name)
                else:  # POSIX file system
                    if not os.path.exists(input_file_path):
                        bad_file_list.append(os.path.basename(input_file_path))
                        logger.debug(
                            "File '%s' does not exist", input_file_path
                        )
                    else:
                        logger.debug("File '%s' exists", input_file_path)

        response_data = {
            "data_files_not_uploaded": [
                file_name for file_name in bad_file_list
                if file_name not in existing_datafile_names
            ],
            "data_files_to_be_deleted": [
                file_name for file_name in existing_datafile_names
                if file_name not in bad_file_list
            ]
        }
        return JsonResponse(response_data)
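parse_s3_url is used here and in Example #5 only to split an s3://bucket/key style location into a bucket name and an object key. A minimal sketch of such a helper, under the assumption that this is all it does:

# Sketch of the assumed behavior of parse_s3_url; not the project's code.
from urlparse import urlparse  # urllib.parse on Python 3


def parse_s3_url_sketch(url):
    """Split 's3://bucket/path/to/key' into ('bucket', 'path/to/key')."""
    parsed = urlparse(url)
    return parsed.netloc, parsed.path.lstrip('/')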
Example #12
def parse_isatab(username,
                 public,
                 path,
                 identity_id=None,
                 additional_raw_data_file_extension=None,
                 isa_archive=None,
                 pre_isa_archive=None,
                 file_base_path=None,
                 overwrite=False):
    """parses in an ISA-TAB file to create database entries and creates or
    updates a dataset for the investigation to belong to; returns the dataset
    UUID or None if something went wrong. Use like this: parse_isatab(username,
    is_public, folder_name, additional_raw_data_file_extension,
    isa_archive=<path>, pre_isa_archive=<path>, file_base_path=<path>
    Parameters:
    username: username of the person the dataset will belong to
    public: boolean that determines if the dataset is public or not
    path: absolute path of the ISA-Tab file to parse
    additional_raw_data_file_extension: an optional argument that will append a
    suffix to items in Raw Data File as need be
    isa_archive: if you're passing a directory, a zipped version of the
    directory for storage and legacy purposes
    pre_isa_archive: optional copy of files that were converted to ISA-Tab
    file_base_path: if your file locations are relative paths, this is the base
    """
    file_source_translator = generate_file_source_translator(
        username=username, base_path=file_base_path, identity_id=identity_id)
    parser = IsaTabParser(
        file_source_translator=file_source_translator,
        additional_raw_data_file_extension=additional_raw_data_file_extension,
    )
    """Get the study title and investigation id and see if anything is in the
    database and if so compare the checksum
    """
    # 1. First check whether the user exists
    try:
        user = User.objects.get(username__exact=username)
    except (User.DoesNotExist, User.MultipleObjectsReturned):
        user = None
    # 2. If user exists we need to quickly get the dataset title to see if its
    # already in the DB
    if user:
        checksum = None
        (identifier, title) = parser.get_dataset_name(path)
        if identifier is None or title is None:
            datasets = []
        else:
            dataset_title = "%s: %s" % (identifier, title)
            datasets = DataSet.objects.filter(name=dataset_title)
        # check if the investigation already exists
        if len(datasets):  # if not 0, update dataset with new investigation
            # go through datasets until you find one with the correct owner
            for ds in datasets:
                own = ds.get_owner()
                if own == user:
                    if overwrite:
                        # Remove the existing data set first
                        checksum = False
                        ds.delete()
                    else:
                        # 3. Finally we need to get the checksum so that we can
                        # compare that to our given file.
                        investigation = ds.get_investigation()
                        fileStoreItem = FileStoreItem.objects.get(
                            uuid=investigation.isarchive_file)
                        if fileStoreItem:
                            try:
                                logger.info("Get file: %s",
                                            fileStoreItem.get_absolute_path())
                                checksum = calculate_checksum(
                                    fileStoreItem.get_file_object())
                            except IOError as exc:
                                logger.error(
                                    "Original ISA-tab archive wasn't found. "
                                    "Error: '%s'", exc)
        # 4. Finally if we got a checksum for an existing file, we calculate
        # the checksum for the new file and compare them
        if checksum:
            new_checksum = None
            # TODO: error handling
            with open(path, 'rb') as f:
                new_checksum = calculate_checksum(f)
            if checksum == new_checksum:
                # Checksums are identical so we can skip this file.
                logger.info("The checksum of both files is the same: %s",
                            checksum)
                return \
                    investigation.investigationlink_set.all()[0].data_set.uuid

    with transaction.atomic():
        investigation = parser.run(path,
                                   isa_archive=isa_archive,
                                   preisa_archive=pre_isa_archive)
        data_uuid = create_dataset(investigation.uuid, username, public=public)
        return data_uuid
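The docstring's "Use like this" hint stops short of a complete call; a hypothetical full invocation (every value here is invented) might be:

# Hypothetical call, following the docstring's "Use like this" hint;
# the username, paths, and extension are made up.
data_set_uuid = parse_isatab(
    "alice",                      # username the data set will belong to
    False,                        # public
    "/data/isatab/my_study",      # absolute path of the ISA-Tab input
    additional_raw_data_file_extension=".fastq.gz",
    isa_archive="/data/isatab/my_study.zip",
    file_base_path="/data/import",
)
if data_set_uuid is None:
    logger.error("ISA-Tab import failed")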
Example #13
def parse_isatab(username, public, path, identity_id=None,
                 additional_raw_data_file_extension=None, isa_archive=None,
                 pre_isa_archive=None, file_base_path=None, overwrite=False,
                 existing_data_set_uuid=None):
    """parses in an ISA-TAB file to create database entries and creates or
    updates a dataset for the investigation to belong to; returns the dataset
    UUID or None if something went wrong. Use like this: parse_isatab(username,
    is_public, folder_name, additional_raw_data_file_extension,
    isa_archive=<path>, pre_isa_archive=<path>, file_base_path=<path>
    Parameters:
    username: username of the person the dataset will belong to
    public: boolean that determines if the dataset is public or not
    path: absolute path of the ISA-Tab file to parse
    additional_raw_data_file_extension: an optional argument that will append a
    suffix to items in Raw Data File as need be
    isa_archive: if you're passing a directory, a zipped version of the
    directory for storage and legacy purposes
    pre_isa_archive: optional copy of files that were converted to ISA-Tab
    file_base_path: if your file locations are relative paths, this is the base
    existing_data_set_uuid: UUID of an existing DataSet that a metadata
    revision is to be performed upon
    """
    file_source_translator = generate_file_source_translator(
        username=username, base_path=file_base_path, identity_id=identity_id
    )
    parser = IsaTabParser(
        file_source_translator=file_source_translator,
        additional_raw_data_file_extension=additional_raw_data_file_extension,
    )
    """Get the study title and investigation id and see if anything is in the
    database and if so compare the checksum
    """
    # 1. First check whether the user exists
    try:
        user = User.objects.get(username__exact=username)
    except (User.DoesNotExist, User.MultipleObjectsReturned):
        user = None
    # 2. If user exists we need to quickly get the dataset title to see if its
    # already in the DB
    if user:
        checksum = None
        (identifier, title) = parser.get_dataset_name(path)
        if identifier is None or title is None:
            datasets = []
        else:
            dataset_title = "%s: %s" % (identifier, title)
            datasets = DataSet.objects.filter(name=dataset_title)
        # check if the investigation already exists
        # if not 0, update dataset with new investigation
        if len(datasets) and not existing_data_set_uuid:
            # go through datasets until you find one with the correct owner
            for ds in datasets:
                own = ds.get_owner()
                if own == user:
                    if overwrite:
                        # Remove the existing data set first
                        checksum = False
                        ds.delete()
                    else:
                        # 3. Finally we need to get the checksum so that we can
                        # compare that to our given file.
                        investigation = ds.get_investigation()
                        fileStoreItem = FileStoreItem.objects.get(
                            uuid=investigation.isarchive_file)
                        if fileStoreItem:
                            try:
                                logger.info("Get file: %s", fileStoreItem)
                                checksum = calculate_checksum(
                                    fileStoreItem.datafile
                                )
                            except IOError as exc:
                                logger.error(
                                    "Original ISA-tab archive wasn't found. "
                                    "Error: '%s'", exc
                                )
        # 4. Finally if we got a checksum for an existing file, we calculate
        # the checksum for the new file and compare them
        if checksum:
            new_checksum = None
            # TODO: error handling
            with open(path, 'rb') as f:
                new_checksum = calculate_checksum(f)
            if checksum == new_checksum:
                # Checksums are identical so we can skip this file.
                logger.info("The checksum of both files is the same: %s",
                            checksum)
                return \
                    investigation.investigationlink_set.all()[0].data_set.uuid

    with transaction.atomic():
        investigation = parser.run(
            path, isa_archive=isa_archive, preisa_archive=pre_isa_archive
        )
        if existing_data_set_uuid:
            data_set = DataSet.objects.get(uuid=existing_data_set_uuid)
            data_set.update_with_revised_investigation(investigation)
            return existing_data_set_uuid

        data_set_uuid = create_dataset(
            investigation.uuid, username, public=public
        )
        return data_set_uuid
Example #14
 def test_translate_from_relative_path_without_username_or_base_path(self):
     translate_file_source = generate_file_source_translator()
     with self.assertRaises(ValueError):
         translate_file_source(self.rel_path_source)
Example #15
def process_metadata_table(
    username,
    title,
    metadata_file,
    source_columns,
    data_file_column,
    auxiliary_file_column=None,
    base_path="",
    data_file_permanent=False,
    species_column=None,
    genome_build_column=None,
    annotation_column=None,
    sample_column=None,
    assay_column=None,
    is_public=False,
    delimiter="comma",
    custom_delimiter_string=",",
    identity_id=None,
    existing_data_set_uuid=None
):
    """Create a dataset given a metadata file object and its description
    :param username: username
    :type username: str
    :param title: dataset name
    :type title: str
    :param metadata_file: metadata file in tab-delimited format
    :type metadata_file: file
    :param source_columns: a list of source column indices
    :type source_columns: list of ints
    :param data_file_column: data file column index
    :type data_file_column: int
    :param data_file_permanent: should data files be imported
    :type data_file_permanent: bool
    :param base_path: path to append to data file
    :type base_path: str
    :param auxiliary_file_column: auxiliary file column index
    :type auxiliary_file_column: int
    :param species_column: species column index
    :type species_column: int
    :param genome_build_column: genome build column index
    :type genome_build_column: int
    :param annotation_column: annotation column index
    :type annotation_column: int
    :param is_public: is dataset available to public
    :type is_public: bool
    :param  existing_data_set_uuid: UUID of an existing DataSet that a
    metadata revision is to be performed upon
    :returns: UUID of the new dataset
    """
    try:
        source_columns = [abs(int(x)) for x in source_columns]
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("source column indices must be integers")
    try:
        data_file_column = int(data_file_column)
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("data file column index must be an integer")
    try:
        auxiliary_file_column = int(auxiliary_file_column)
    except (TypeError, ValueError):
        auxiliary_file_column = None
    try:
        base_path = str(base_path)
    except ValueError:
        base_path = ""
    try:
        species_column = int(species_column)
    except (TypeError, ValueError):
        species_column = None
    try:
        genome_build_column = int(genome_build_column)
    except (TypeError, ValueError):
        genome_build_column = None
    try:
        annotation_column = int(annotation_column)
    except (TypeError, ValueError):
        annotation_column = None
    try:
        sample_column = int(sample_column)
    except (TypeError, ValueError):
        sample_column = None
    try:
        assay_column = int(assay_column)
    except (TypeError, ValueError):
        assay_column = None
    try:
        delimiter = str(delimiter)
    except ValueError:
        delimiter = "comma"

    try:
        custom_delimiter_string = str(custom_delimiter_string)
    except ValueError:
        custom_delimiter_string = ","

    data_file_permanent = bool(data_file_permanent)
    is_public = bool(is_public)
    file_source_translator = generate_file_source_translator(
        username=username, base_path=base_path, identity_id=identity_id
    )

    # TODO: From here on should be run within a transaction as to not commit
    #  things to the db on an import failure, but doing so doesn't allow for
    #  the association of uploaded datafiles
    parser = SingleFileColumnParser(
        metadata_file=metadata_file,
        file_source_translator=file_source_translator,
        source_column_index=source_columns,
        data_file_column_index=data_file_column,
        auxiliary_file_column_index=auxiliary_file_column,
        file_base_path=base_path,
        data_file_permanent=data_file_permanent,
        species_column_index=species_column,
        genome_build_column_index=genome_build_column,
        annotation_column_index=annotation_column,
        sample_column_index=sample_column,
        assay_column_index=assay_column,
        column_index_separator="/",
        delimiter=delimiter,
        custom_delimiter_string=custom_delimiter_string
    )
    investigation = parser.run()
    investigation.title = title
    investigation.save()

    if existing_data_set_uuid:
        data_set = DataSet.objects.get(uuid=existing_data_set_uuid)
        data_set.update_with_revised_investigation(investigation)
        return existing_data_set_uuid
    return create_dataset(
        investigation_uuid=investigation.uuid, username=username,
        dataset_name=title, public=is_public
    )
Example #16
def parse_isatab(username,
                 public,
                 path,
                 identity_id=None,
                 additional_raw_data_file_extension=None,
                 isa_archive=None,
                 pre_isa_archive=None,
                 file_base_path=None,
                 overwrite=False,
                 existing_data_set_uuid=None):
    """parses in an ISA-TAB file to create database entries and creates or
    updates a dataset for the investigation to belong to; returns the dataset
    UUID or None if something went wrong. Use like this: parse_isatab(username,
    is_public, folder_name, additional_raw_data_file_extension,
    isa_archive=<path>, pre_isa_archive=<path>, file_base_path=<path>
    Parameters:
    username: username of the person the dataset will belong to
    public: boolean that determines if the dataset is public or not
    path: absolute path of the ISA-Tab file to parse
    additional_raw_data_file_extension: an optional argument that will append a
    suffix to items in Raw Data File as need be
    isa_archive: if you're passing a directory, a zipped version of the
    directory for storage and legacy purposes
    pre_isa_archive: optional copy of files that were converted to ISA-Tab
    file_base_path: if your file locations are relative paths, this is the base
    existing_data_set_uuid: UUID of an existing DataSet that a metadata
    revision is to be performed upon
    """
    file_source_translator = generate_file_source_translator(
        username=username, base_path=file_base_path, identity_id=identity_id)
    parser = IsaTabParser(
        file_source_translator=file_source_translator,
        additional_raw_data_file_extension=additional_raw_data_file_extension,
    )
    """Get the study title and investigation id and see if anything is in the
    database and if so compare the checksum
    """
    # 1. First check whether the user exists
    try:
        user = User.objects.get(username__exact=username)
    except (User.DoesNotExist, User.MultipleObjectsReturned):
        user = None
    # 2. If user exists we need to quickly get the dataset title to see if its
    # already in the DB
    if user:
        checksum = None
        (identifier, title) = parser.get_dataset_name(path)
        if identifier is None or title is None:
            datasets = []
        else:
            dataset_title = "%s: %s" % (identifier, title)
            datasets = DataSet.objects.filter(name=dataset_title)
        # check if the investigation already exists
        # if not 0, update dataset with new investigation
        if len(datasets) and not existing_data_set_uuid:
            # go through datasets until you find one with the correct owner
            for ds in datasets:
                own = ds.get_owner()
                if own == user:
                    if overwrite:
                        # Remove the existing data set first
                        checksum = False
                        ds.delete()
                    else:
                        # 3. Finally we need to get the checksum so that we can
                        # compare that to our given file.
                        investigation = ds.get_investigation()
                        try:
                            """isaarchive_file should be a uuid foreign key
                            upon creation of either FileStoreItem or
                            Investigation in isa_tab_parser.py"""
                            file_store_item = FileStoreItem.objects.get(
                                uuid=investigation.isarchive_file)
                            logger.info("Get file: %s", file_store_item)
                        # will fail later on when the .datafile is accessed
                        except (FileStoreItem.DoesNotExist,
                                FileStoreItem.MultipleObjectsReturned) as e:
                            logger.error(
                                'Did not get FileStoreItem for uuid %s: %s',
                                unicode(investigation.isarchive_file), e)

                        try:
                            checksum = calculate_checksum(
                                file_store_item.datafile)
                        except (EnvironmentError,
                                botocore.exceptions.BotoCoreError,
                                botocore.exceptions.ClientError) as exc:
                            logger.error(
                                "Original ISA-tab archive was not found: %s",
                                exc)
        # 4. Finally if we got a checksum for an existing file, we calculate
        # the checksum for the new file and compare them
        if checksum:
            new_checksum = None
            # TODO: error handling
            with open(path, 'rb') as f:
                new_checksum = calculate_checksum(f)
            if checksum == new_checksum:
                # Checksums are identical so we can skip this file.
                logger.info("The checksum of both files is the same: %s",
                            checksum)
                return \
                    investigation.investigationlink_set.all()[0].data_set.uuid

    with transaction.atomic():
        investigation = parser.run(path,
                                   isa_archive=isa_archive,
                                   preisa_archive=pre_isa_archive)
        if existing_data_set_uuid:
            try:
                data_set = DataSet.objects.get(uuid=existing_data_set_uuid)
            except (DataSet.DoesNotExist,
                    DataSet.MultipleObjectsReturned) as e:
                logger.error(
                    'DataSet for uuid %s not fetched and thus not '
                    'updated with revised investigation %s: %s',
                    existing_data_set_uuid, unicode(investigation), e)
                raise type(e)(
                    'DataSet for uuid {} not fetched and thus not '
                    'updated with revised investigation {}'.format(
                        existing_data_set_uuid, unicode(investigation)))
            else:
                data_set.update_with_revised_investigation(investigation)
                return existing_data_set_uuid

        data_set_uuid = create_dataset(investigation.uuid,
                                       username,
                                       public=public)
        return data_set_uuid
Example #17
 def parse(self, dir_name):
     file_source_translator = generate_file_source_translator(
         username=self.user.username)
     dir = os.path.join(TEST_DATA_BASE_PATH, dir_name)
     return IsaTabParser(
         file_source_translator=file_source_translator).run(dir)
Example #18
def process_metadata_table(username,
                           title,
                           metadata_file,
                           source_columns,
                           data_file_column,
                           auxiliary_file_column=None,
                           base_path="",
                           data_file_permanent=False,
                           species_column=None,
                           genome_build_column=None,
                           annotation_column=None,
                           sample_column=None,
                           assay_column=None,
                           slug=None,
                           is_public=False,
                           delimiter="comma",
                           custom_delimiter_string=",",
                           identity_id=None):
    """Create a dataset given a metadata file object and its description
    :param username: username
    :type username: str
    :param title: dataset name
    :type title: str
    :param metadata_file: metadata file in tab-delimited format
    :type metadata_file: file
    :param source_columns: a list of source column indices
    :type source_columns: list of ints
    :param data_file_column: data file column index
    :type data_file_column: int
    :param data_file_permanent: should data files be imported
    :type data_file_permanent: bool
    :param base_path: path to append to data file
    :type base_path: str
    :param auxiliary_file_column: auxiliary file column index
    :type auxiliary_file_column: int
    :param species_column: species column index
    :type species_column: int
    :param genome_build_column: genome build column index
    :type genome_build_column: int
    :param annotation_column: annotation column index
    :type annotation_column: int
    :param slug: dataset name shortcut
    :type slug: str
    :param is_public: is dataset available to public
    :type is_public: bool
    :returns: UUID of the new dataset
    """
    try:
        source_columns = [abs(int(x)) for x in source_columns]
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("source column indices must be integers")
    try:
        data_file_column = int(data_file_column)
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("data file column index must be an integer")
    try:
        auxiliary_file_column = int(auxiliary_file_column)
    except (TypeError, ValueError):
        auxiliary_file_column = None
    try:
        base_path = str(base_path)
    except ValueError:
        base_path = ""
    try:
        species_column = int(species_column)
    except (TypeError, ValueError):
        species_column = None
    try:
        genome_build_column = int(genome_build_column)
    except (TypeError, ValueError):
        genome_build_column = None
    try:
        annotation_column = int(annotation_column)
    except (TypeError, ValueError):
        annotation_column = None
    try:
        sample_column = int(sample_column)
    except (TypeError, ValueError):
        sample_column = None
    try:
        assay_column = int(assay_column)
    except (TypeError, ValueError):
        assay_column = None
    try:
        if slug:
            slug = str(slug)
    except ValueError:
        slug = None

    try:
        delimiter = str(delimiter)
    except ValueError:
        delimiter = "comma"

    try:
        custom_delimiter_string = str(custom_delimiter_string)
    except ValueError:
        custom_delimiter_string = ","

    data_file_permanent = bool(data_file_permanent)
    is_public = bool(is_public)
    file_source_translator = generate_file_source_translator(
        username=username, base_path=base_path, identity_id=identity_id)

    parser = SingleFileColumnParser(
        metadata_file=metadata_file,
        file_source_translator=file_source_translator,
        source_column_index=source_columns,
        data_file_column_index=data_file_column,
        auxiliary_file_column_index=auxiliary_file_column,
        file_base_path=base_path,
        data_file_permanent=data_file_permanent,
        species_column_index=species_column,
        genome_build_column_index=genome_build_column,
        annotation_column_index=annotation_column,
        sample_column_index=sample_column,
        assay_column_index=assay_column,
        column_index_separator="/",
        delimiter=delimiter,
        custom_delimiter_string=custom_delimiter_string)

    investigation = parser.run()
    investigation.title = title
    investigation.save()

    return create_dataset(investigation_uuid=investigation.uuid,
                          username=username,
                          dataset_name=title,
                          slug=slug,
                          public=is_public)
Example #19
def process_metadata_table(username,
                           title,
                           metadata_file,
                           source_columns,
                           data_file_column,
                           auxiliary_file_column=None,
                           base_path="",
                           data_file_permanent=False,
                           species_column=None,
                           genome_build_column=None,
                           annotation_column=None,
                           sample_column=None,
                           assay_column=None,
                           is_public=False,
                           delimiter="comma",
                           custom_delimiter_string=",",
                           identity_id=None,
                           existing_data_set_uuid=None):
    """Create a dataset given a metadata file object and its description
    :param username: username
    :type username: str
    :param title: dataset name
    :type title: str
    :param metadata_file: metadata file in tab-delimited format
    :type metadata_file: file
    :param source_columns: a list of source column indices
    :type source_columns: list of ints
    :param data_file_column: data file column index
    :type data_file_column: int
    :param data_file_permanent: should data files be imported
    :type data_file_permanent: bool
    :param base_path: path to append to data file
    :type base_path: str
    :param auxiliary_file_column: auxiliary file column index
    :type auxiliary_file_column: int
    :param species_column: species column index
    :type species_column: int
    :param genome_build_column: genome build column index
    :type genome_build_column: int
    :param annotation_column: annotation column index
    :type annotation_column: int
    :param is_public: is dataset available to public
    :type is_public: bool
    :param  existing_data_set_uuid: UUID of an existing DataSet that a
    metadata revision is to be performed upon
    :returns: UUID of the new dataset
    """
    try:
        source_columns = [abs(int(x)) for x in source_columns]
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("source column indices must be integers")
    try:
        data_file_column = int(data_file_column)
    except ValueError as exc:
        logger.error(exc)
        raise ValueError("data file column index must be an integer")
    try:
        auxiliary_file_column = int(auxiliary_file_column)
    except (TypeError, ValueError):
        auxiliary_file_column = None
    try:
        base_path = str(base_path)
    except ValueError:
        base_path = ""
    try:
        species_column = int(species_column)
    except (TypeError, ValueError):
        species_column = None
    try:
        genome_build_column = int(genome_build_column)
    except (TypeError, ValueError):
        genome_build_column = None
    try:
        annotation_column = int(annotation_column)
    except (TypeError, ValueError):
        annotation_column = None
    try:
        sample_column = int(sample_column)
    except (TypeError, ValueError):
        sample_column = None
    try:
        assay_column = int(assay_column)
    except (TypeError, ValueError):
        assay_column = None
    try:
        delimiter = str(delimiter)
    except ValueError:
        delimiter = "comma"

    try:
        custom_delimiter_string = str(custom_delimiter_string)
    except ValueError:
        custom_delimiter_string = ","

    data_file_permanent = bool(data_file_permanent)
    is_public = bool(is_public)
    file_source_translator = generate_file_source_translator(
        username=username, base_path=base_path, identity_id=identity_id)

    # TODO: From here on should be run within a transaction as to not commit
    #  things to the db on an import failure, but doing so doesn't allow for
    #  the association of uploaded datafiles
    parser = SingleFileColumnParser(
        metadata_file=metadata_file,
        file_source_translator=file_source_translator,
        source_column_index=source_columns,
        data_file_column_index=data_file_column,
        auxiliary_file_column_index=auxiliary_file_column,
        file_base_path=base_path,
        data_file_permanent=data_file_permanent,
        species_column_index=species_column,
        genome_build_column_index=genome_build_column,
        annotation_column_index=annotation_column,
        sample_column_index=sample_column,
        assay_column_index=assay_column,
        column_index_separator="/",
        delimiter=delimiter,
        custom_delimiter_string=custom_delimiter_string)
    investigation = parser.run()
    investigation.title = title
    investigation.save()

    if existing_data_set_uuid:
        try:
            data_set = DataSet.objects.get(uuid=existing_data_set_uuid)
        except (DataSet.DoesNotExist, DataSet.MultipleObjectsReturned) as e:
            logger.error(
                'DataSet for uuid %s not fetched and thus not '
                'updated with revised investigation %s: %s',
                existing_data_set_uuid, unicode(investigation), e)
            raise type(e)('DataSet for uuid {} not fetched and thus not '
                          'updated with revised investigation {}'.format(
                              existing_data_set_uuid, unicode(investigation)))
        else:
            data_set.update_with_revised_investigation(investigation)
            return existing_data_set_uuid

    return create_dataset(investigation_uuid=investigation.uuid,
                          username=username,
                          dataset_name=title,
                          public=is_public)