Example #1
def get_upload_information(request):
    submission_id = request.GET.get('submission_id')

    # tonietuk's intercept starts
    if not submission_id:
        data = {'found': False}
        return HttpResponse(json.dumps(data))
    # tonietuk's intercept ends

    # get submission collection and check status
    sub = Submission().get_record(submission_id)
    if sub:
        if not sub.get('complete') or sub.get('complete') == 'false':
            rem = RemoteDataFile().get_by_sub_id(submission_id)
            if rem:
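                # report only the most recent 100 sampled transfer rates alongside the overall percentage completed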
                speeds = rem['transfer_rate'][-100:]
                complete = rem['pct_completed']
                data = {'speeds': speeds, 'complete': complete, 'finished': False, 'found': True}
                return HttpResponse(json.dumps(data))
        else:
            # elapsed = str(parser.parse(sub['completed_on']) - parser.parse(sub['commenced_on']))
            # data = {'upload_time': str(elapsed), 'completed_on': sub['completed_on'], 'article_id': sub.get('article_id'), 'finished': True, 'found': True}
            data = {'sub_id': str(sub['_id']), 'status': sub['status'], 'accessions': sub['accessions'],
                    'repo': sub['repository'], 'completed_on': sub['completed_on'].strftime("%Y-%m-%d %H:%M:%S"),
                    'article_id': sub.get('article_id'), 'finished': True, 'found': True}
            return HttpResponse(json.dumps(data))

    data = {'found': False}
    return HttpResponse(json.dumps(data))
Example #2
def resolve_submission_id(request, submission_id):
    sub = Submission().get_record(submission_id)
    # get all file metadata
    output = dict()
    files = list()
    for f in sub.get("bundle", list()):
        file = DataFile().get_record(f)
        files.append(file["description"]["attributes"])
    output["files"] = files
    output["accessions"] = sub["accessions"]
    output["metadata"] = {}
    output["metadata"]["dc"] = sub["meta"]["fields"]
    return HttpResponse(json.dumps(output))
Example #3
    def _do_file_transfer(self):
        submission_record = Submission().get_record(self.submission_id)

        # do we have files to be uploaded?
        bundle_df = pd.DataFrame(submission_record.get("bundle_meta", list()))

        if len(bundle_df) == 0:  # insufficient information to proceed - no bundle meta
            return

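        # select only the files that have not yet been uploaded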
        pending_df = bundle_df[bundle_df['upload_status'] == False]

        if len(pending_df) > 0:
            path2library = os.path.join(
                BASE_DIR, REPOSITORIES['ASPERA']['resource_path'])

            user_name = REPOSITORIES['ASPERA']['user_token']
            password = REPOSITORIES['ASPERA']['password']

            # compose remote file directory
            remote_path = d_utils.get_ena_remote_path(self.submission_id)

            self._do_aspera_transfer(user_name=user_name,
                                     password=password,
                                     remote_path=remote_path,
                                     file_path=list(pending_df['file_path']),
                                     path2library=path2library)
        else:
            # no files to be uploaded
            transfer_fields = dict()
            transfer_fields["transfer_status"] = "completed"
            transfer_fields["pct_completed"] = '100'
            transfer_fields["current_time"] = datetime.now().strftime(
                "%d-%m-%Y %H:%M:%S")

            # save collected metadata to the transfer record
            RemoteDataFile().update_transfer(self.transfer_token,
                                             transfer_fields)

            self.context["ena_status"] = "files_transferred"

        return
Example #4
def get_upload_information(request):
    context = dict()

    ids = json.loads(request.POST.get("ids", "[]"))
    sub_info_list = list()

    submission_queue_handle = ghlper.get_submission_queue_handle()

    for id in ids:
        # get submission record and check submission status
        try:
            sub = Submission().get_record(id)
        except Exception:
            sub = dict()

        if not sub:
            continue

        sub_info_dict = dict()
        sub_info_dict["submission_id"] = id
        sub_info_dict["enable_submit_button"] = True

        repo = sub.get("repository", str()).lower()

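        # these repositories need either user-supplied metadata fields or an existing
        # identifier before the submit button can be enabled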
        if repo in ["cg_core", "dataverse", "dspace", "ckan"]:
            if "meta" in sub and "fields" in sub[
                    "meta"] or "identifier" in sub["meta"]:
                pass
            else:
                sub_info_dict["enable_submit_button"] = False

        if str(sub.get("complete", False)).lower() == 'true':
            # submission has finished
            sub_info_dict["submission_status"] = True
            sub_info_dict["completed_on"] = sub.get(
                'completed_on', str()).strftime('%d %b, %Y, %H:%M') if sub.get(
                    'completed_on', str()) else 'unavailable'
            try:
                sub_info_dict["article_id"] = sub['article_id']
            except Exception:
                sub_info_dict["article_id"] = "unavailable"

            # get study embargo info
            if repo == "ena":
                # get study accession
                prj = sub.get('accessions', dict()).get('project', [{}])
                status = prj[0].get("status", "Unknown")
                release_date = prj[0].get("release_date", str())
                if status.upper() == "PRIVATE":
                    sub_info_dict["release_status"] = "PRIVATE"

                    sub_info_dict["release_date"] = release_date
                    if len(release_date) >= 10:  # e.g. '2019-08-30'
                        try:
                            datetime_object = datetime.strptime(
                                release_date[:10], '%Y-%m-%d')
                            sub_info_dict["release_date"] = time.strftime(
                                '%a, %d %b %Y %H:%M',
                                datetime_object.timetuple())
                        except Exception:
                            pass

                    sub_info_dict["release_message"] = "<div>All objects in this " \
                                                       "submission are set to " \
                                                       "private (confidential) status.</div>" \
                                                       "<div style='margin-top:10px;'>The release date is set for " \
                                                       "" + sub_info_dict["release_date"] + \
                                                       ".</div><div style='margin-top:10px;'>" \
                                                       "To release this study to the public, " \
                                                       "click the release study button.</div>"
                elif status.upper() == "PUBLIC":
                    sub_info_dict["release_status"] = "PUBLIC"
                    sub_info_dict[
                        "study_view_url"] = "https://www.ebi.ac.uk/ena/data/view/" + prj[
                            0].get("accession", str())
                    sub_info_dict["release_message"] = "<div>All objects in " \
                                                       "this submission are set to public status.</div> " \
                                                       "<div style='margin-top:10px;'>To view this study " \
                                                       "on the ENA browser (opens in a new browser tab), " \
                                                       "click the view on ENA button.</div>"
                else:
                    sub_info_dict["release_status"] = "Unknown"
                    sub_info_dict["release_message"] = "<div>The embargo status of " \
                                                       "this study is unknown.</div>" \
                                                       "<div>For more details, please contact your administrator. " \
                                                       "Alternatively, you can try searching for the study on the " \
                                                       "ENA browser to verify its status.</div>"
        else:
            sub_info_dict["is_active_submission"] = False
            if repo == "ena":  # this will be extended to other repositories/submission end-points
                submission_in_queue = submission_queue_handle.find_one(
                    {"submission_id": sub_info_dict["submission_id"]})
                if submission_in_queue:  # submission is queued, flag as active to prevent resubmission
                    sub_info_dict["is_active_submission"] = True

            # get status report
            status = sub.get("transcript", dict()).get('status', dict())
            if status:
                # status types are either 'info' or 'error'
                sub_info_dict["submission_report"] = dict(
                    type=status.get('type', str()),
                    message=status.get('message', str()))

            # report on submitted datafiles - ENA for now...
            if repo == "ena":
                run_accessions = sub.get('accessions',
                                         dict()).get('run', list())
                submitted_files = [
                    x for y in run_accessions
                    for x in y.get('datafiles', list())
                ]

                if submitted_files:
                    sub_info_dict["submitted_files"] = submitted_files

        sub_info_list.append(sub_info_dict)

    context["submission_information"] = sub_info_list
    out = jsonpickle.encode(context)
    return HttpResponse(out, content_type='application/json')
Example #5
class DataverseSubmit(object):
    def __init__(self, submission_id=str()):
        self.submission_id = submission_id

        self.submission_record = dict()
        self.file_path = str()
        self.host = str()
        self.api_token = str()
        self.headers = dict()

        if self.submission_id:
            # get submission record
            self.submission_record = Submission().get_record(
                self.submission_id)

            # set up submission parameters...

            # submission path
            dir = os.path.join(os.path.dirname(__file__), "data")
            self.file_path = os.path.join(
                os.path.join(dir, self.submission_id), 'dataverse')

            # dataverse host
            self.host = self.submission_record.get("destination_repo",
                                                   dict()).get("url", str())

            # api_token
            self.api_token = self.submission_record.get(
                "destination_repo", dict()).get("apikey", str())

            # headers
            self.headers = {'X-Dataverse-key': self.api_token}

    def submit(self):
        """
        function controls the submission of objects to a Dataverse
        :return:
        """

        sub_meta = self.submission_record.get("meta", dict())

        # if dataset id in submission meta, we are adding to existing dataset, otherwise
        #  we are creating a new dataset
        if "fields" in sub_meta:
            return self._create_and_add_to_dataverse()
        elif ('entity_id' in sub_meta
              and 'alias' in sub_meta) or ('dataverse_alias' in sub_meta
                                           and 'doi' in sub_meta):
            return self._add_to_dataverse()

    def truncate_url(self, url):
        if url.startswith('https://'):
            url = url[8:]
        elif url.startswith('http://'):
            url = url[7:]
        return url

    @staticmethod
    def get_format_doi(doi):
        """
        function formats passed doi for api calls to dataverse
        :param doi:
        :return:
        """
        doi_prefixes = [
            "https://doi.org/", "http://doi.org/", "https://", "http://",
            "doi.org/"
        ]

        for dp in doi_prefixes:
            if dp in doi:
                doi = "doi:" + doi.split(dp)[-1]
                break

        return doi

    def clear_submission_metadata(self):
        Submission().clear_submission_metadata(self.submission_id)

    def get_dataverse_details(self, dataverse_alias):
        """
        function retrieves dataverse details given its alias
        :param dataverse_alias:
        :return:
        """

        response_data = dict()

        try:
            url = self.host + "/api/dataverses/" + dataverse_alias
            response = requests.get(url)
            if str(response.status_code).lower() in ("ok", "200"):
                response_data = response.json().get("data", dict())
        except Exception as e:
            exception_message = "Error retrieving dataverse details " + url + " : " + str(
                e)
            self.report_error(exception_message)

        return response_data

    def get_dataset_details(self, doi):
        """
        function retrieves dataset details given its doi
        :param doi:
        :return:
        """

        response_data = dict()

        # retrieve dataset details given its doi
        headers = {'X-Dataverse-key': self.api_token}

        # get formatted doi
        doi = self.get_format_doi(doi)

        params = (('persistentId', doi), )

        try:
            url = self.host + "/api/datasets/:persistentId/"
            response = requests.get(url, headers=headers, params=params)
            if str(response.status_code).lower() in ("ok", "200"):
                response_data = response.json().get("data", dict())
        except Exception as e:
            exception_message = "Error retrieving dataset details " + url + " : " + str(
                e)
            self.report_error(exception_message)

        return response_data

    def _add_to_dataverse(self):
        """
        function adds datafiles to a dataset
        :return:
        """
        sub = self.submission_record

        # check for dataverse alias

        alias = sub.get("meta", dict()).get(
            "dataverse_alias", str()) or sub.get("meta", dict()).get(
                "alias", str())

        if not alias:
            return {"status": 404, "message": "\n Error getting dataverse"}

        # check for dataset doi
        doi = sub.get("meta", dict()).get("doi", str())

        if not doi:
            return {"status": 404, "message": "\n Error getting dataset"}

        # add file to dataset
        result = self.send_files_curl(persistent_id=doi)

        if result is True:
            # store accessions and clear submission
            dv_response_data = self.get_dataverse_details(alias)
            ds_response_data = self.get_dataset_details(doi)

            dataset_title = [
                x["value"] for x in ds_response_data.get(
                    "latestVersion", dict()).get("metadataBlocks", dict()).get(
                        "citation", dict()).get("fields", dict())
                if x.get("typeName", str()) == "title"
            ]

            acc = dict()
            acc['dataset_id'] = ds_response_data.get("id", str())
            acc['dataset_doi'] = doi
            acc['dataverse_alias'] = alias
            acc['dataverse_title'] = dv_response_data.get("name", "N/A")
            acc['dataset_title'] = "N/A"

            if dataset_title:
                if isinstance(dataset_title, list):
                    acc['dataset_title'] = dataset_title[0]
                elif isinstance(dataset_title, str):
                    acc['dataset_title'] = dataset_title

            sub['accessions'] = acc
            sub['target_id'] = sub.pop('_id', self.submission_id)
            Submission().save_record(dict(), **sub)

            self.clear_submission_metadata()

        return result

    def _create_and_add_to_dataverse(self):
        """
        creates a Dataset in a Dataverse
        :return:
        """

        # proceed with the creation of a dataset iff no accessions are recorded
        dataset_persistent_id = self.submission_record.get(
            "accessions", dict()).get("dataset_doi", str())

        # a dataset has already been created for this submission; skip creation
        if dataset_persistent_id:
            return self.post_dataset_creation(
                persistent_id=dataset_persistent_id)

        # get dataverse alias
        dataverse_alias = self.submission_record.get("meta", dict()).get(
            "alias", str())

        if not dataverse_alias:
            exception_message = 'Dataverse alias not found! '
            self.report_error(exception_message)
            return exception_message
            # raise OperationFailedError(exception_message)

        # convert to Dataset metadata
        metadata_file_path = self.do_conversion()

        # make API call
        api_call = 'curl -H "X-Dataverse-key: {api_token}" -X POST ' \
                   '{server_url}/api/dataverses/{dv_alias}/datasets --upload-file {dataset_json}'

        api_call = api_call.format(api_token=self.api_token,
                                   server_url=self.host,
                                   dv_alias=dataverse_alias,
                                   dataset_json=metadata_file_path)

        # retrieve call result
        try:
            receipt = subprocess.check_output(api_call, shell=True)
            receipt = json.loads(receipt.decode('utf-8'))
        except Exception as e:
            exception_message = 'API call error: ' + str(e)
            self.report_error(exception_message)
            return exception_message
            # raise OperationFailedError(exception_message)
        else:
            if receipt.get("status", str()).lower() in ("ok", "200"):
                receipt = receipt.get("data", dict())
            else:
                exception_message = 'The Dataset could not be created. ' + str(
                    receipt)
                self.report_error(exception_message)
                return exception_message
                # raise OperationFailedError(exception_message)

        dataset_persistent_id = receipt.get("persistentId", str())
        dataset_id = receipt.get("id", str())

        # retrieve and store accessions to db
        sub = self.submission_record
        acc = dict()
        acc['dataset_id'] = dataset_id
        acc['dataset_doi'] = dataset_persistent_id
        acc['dataverse_alias'] = dataverse_alias
        acc['dataset_title'] = "N/A"

        # retrieve dataverse details given its alias
        dv_response_data = self.get_dataverse_details(dataverse_alias)
        acc['dataverse_title'] = dv_response_data.get("name", "N/A")

        # retrieve dataset details given its doi
        ds_response_data = self.get_dataset_details(dataset_persistent_id)
        dataset_title = [
            x["value"]
            for x in ds_response_data.get("latestVersion", dict()).get(
                "metadataBlocks", dict()).get("citation", dict()).get(
                    "fields", dict()) if x.get("typeName", str()) == "title"
        ]

        if dataset_title:
            if isinstance(dataset_title, list):
                acc['dataset_title'] = dataset_title[0]
            elif isinstance(dataset_title, str):
                acc['dataset_title'] = dataset_title

        # update submission record with accessions
        sub['accessions'] = acc
        sub['target_id'] = sub.pop('_id', self.submission_id)
        Submission().save_record(dict(), **sub)

        # do post creation tasks
        return self.post_dataset_creation(persistent_id=dataset_persistent_id)

    def post_dataset_creation(self, persistent_id=str()):
        """
        upon completion of dataset creation, perform this task(s)
        :param persistent_id:
        :return:
        """
        # add file to dataset
        result = self.send_files_curl(persistent_id=persistent_id)

        if result is True:
            self.clear_submission_metadata()

        return result

    def send_files(self, sub, ds):

        for id in sub['bundle']:
            file = DataFile().get_record(ObjectId(id))
            file_location = file['file_location']
            file_name = file['name']
            with open(file_location, 'rb') as f:
                contents = f.read()
                ds.upload_file(file_name, contents, zip_files=False)

    def send_files_curl(self, persistent_id=str()):
        """
        function uses curl to add datafiles to a Dataverse dataset, given its persistent_id (DOI)
        :param persistent_id:
        :return:
        """

        # get submission record
        sub = self.submission_record

        # get formatted doi
        persistent_id = self.get_format_doi(persistent_id)

        datafiles = sub.get("bundle_meta", list())

        # get all pending files
        pending_files = [
            x for x in datafiles if x.get("upload_status", False) is False
        ]

        if not pending_files:  # update status and exit method
            if sub.get("complete", False) is False:
                sub['complete'] = True
                sub['completed_on'] = datetime.now()
                sub['target_id'] = sub.pop('_id', self.submission_id)
                Submission().save_record(dict(), **sub)

            return True

        # compose api call
        api_call = 'curl -H "X-Dataverse-key:{api_token}" -X ' \
                   'POST -F \'file=@{data_file}\' -F \'jsonData={{"description":"Datafile","categories":["Data"], ' \
                   '"restrict":"true"}}\' "{server_url}/api/datasets/:persistentId/add?persistentId={persistent_id}"'
        api_call = api_call.format(api_token=self.api_token,
                                   server_url=self.host,
                                   persistent_id=persistent_id,
                                   data_file='mock-datafile')

        upload_error = ""
        for df in pending_files:
            upload_string = api_call.replace("mock-datafile",
                                             df.get("file_path", str()))
            try:
                receipt = subprocess.check_output(upload_string, shell=True)
                receipt = json.loads(receipt.decode('utf-8'))
            except Exception as e:
                exception_message = "Error uploading file " + df.get(
                    "file_path", str()) + " : " + str(e)
                self.report_error(exception_message)
                upload_error = upload_error + "\n" + exception_message
            else:
                if receipt.get("status", str()).lower() in ("ok", "200"):
                    df["upload_status"] = True
                else:
                    exception_message = "Error uploading file " + df.get(
                        "file_path", str()) + " : " + str(receipt)
                    self.report_error(exception_message)
                    upload_error = upload_error + "\n" + exception_message

        # if all files uploaded, mark submission as complete
        pending_files = [
            x for x in pending_files if x.get("upload_status", False) is False
        ]

        if pending_files:
            return {"status": 404, "message": upload_error}

        sub['complete'] = True
        sub['completed_on'] = datetime.now()
        sub['target_id'] = sub.pop('_id', self.submission_id)
        Submission().save_record(dict(), **sub)

        return True

    def _get_connection(self):
        dvurl = self.host['url']
        apikey = self.host['apikey']
        dvurl = self.truncate_url(dvurl)
        c = Connection(dvurl, apikey)
        return c

    def _get_dataverse(self, profile_id):
        # create new dataverse if none already exists
        u = data_utils.get_current_user()
        # create new dataverse if none exists already
        dv_details = Profile().check_for_dataverse_details(profile_id)
        if not dv_details:
            # dataverse = connection.create_dataverse(dv_alias, '{0} {1}'.format(u.first_name, u.last_name), u.email)
            dv_details = self._create_dataverse(profile_id)
            Profile().add_dataverse_details(profile_id, dv_details)

        return dv_details

    def _create_dataverse(self, meta, conn):
        alias = str(uuid.uuid4())
        email = ""
        for f in meta["fields"]:
            if f["dc"] == "dc.title":
                name = f["vals"][0]
            if f["dc"] == "dc.email":
                email = f["vals"][0]
        if email == "":
            u = ThreadLocal.get_current_user()
            email = u.email
        dv = conn.create_dataverse(alias, name, email)
        return dv

    def _create_dataset(self, meta, dv, conn):
        dv.create_dataset()
        x = self._make_dataset_xml(meta)
        Dataset.from_xml_file()

    def _get_dataset(self, profile_id, dataFile_ids, dataverse):
        # create new dataset if none exists already
        ds_details = Profile().check_for_dataset_details(profile_id)
        if not ds_details:
            ds_details = self._create_dataset(dataFile_ids=dataFile_ids,
                                              dataverse=dataverse)
            Profile().add_dataverse_dataset_details(profile_id, ds_details)
        return ds_details

    def _make_dataset_xml(self, sub):
        meta = sub['meta']

        # iterate through meta to get fields
        d = dict()
        datafile = DataFile().get_record(ObjectId(sub['bundle'][0]))
        df = datafile['description']['attributes']

        xml = '<?xml version="1.0"?>'
        xml = xml + '<entry xmlns="http://www.w3.org/2005/Atom" xmlns:dcterms="http://purl.org/dc/terms/">'
        xml = xml + '<dcterms:contributor>' + "*****@*****.**" + '</dcterms:contributor>'

        for item in meta["fields"]:

            if type(item["vals"]) == type(""):
                tail = item["dc"].split(".")[1]
                xml = xml + "<dcterms:" + tail + '>' + item[
                    "vals"] + "</dcterms:" + tail + '>'

            elif type(item["vals"] == type(list())):
                for val in item["vals"]:
                    tail = item["dc"].split(".")[1]
                    xml = xml + '<dcterms:' + tail + '>' + val + '</dcterms:' + tail + '>'

        xml = xml + "</entry>"
        path = os.path.dirname(datafile['file_location'])
        xml_path = os.path.join(path, 'xml.xml')
        with open(xml_path, 'w+') as f:
            f.write(xml)
        return xml_path

    def _update_submission_record(self,
                                  sub,
                                  dataset,
                                  dataverse,
                                  dv_storageIdentifier=None):
        # add mongo_file id
        acc = dict()
        acc['storageIdentifier'] = dv_storageIdentifier
        acc['mongo_file_id'] = dataset.id
        acc['dataset_doi'] = dataset.doi
        acc['dataset_edit_media_uri'] = dataset.edit_media_uri
        acc['dataset_edit_uri'] = dataset.edit_uri
        acc['dataset_is_deleted'] = dataset.is_deleted
        acc['dataset_title'] = dataset.title
        acc['dataverse_title'] = dataset.dataverse.title
        acc['dataverse_alias'] = dataset.dataverse.alias
        acc['dataset_id'] = dataset._id
        # save accessions to mongo profile record
        sub['accessions'] = acc
        sub['complete'] = True
        sub['target_id'] = str(sub.pop('_id'))
        Submission().save_record(dict(), **sub)
        Submission().mark_submission_complete(sub['target_id'])
        return True

    def _listize(self, value):
        # split a comma-separated string into a list
        if value == '':
            return None
        return value.split(',')

    def publish_dataverse(self, sub_id):
        # get url for dataverse
        self.host = Submission().get_dataverse_details(sub_id)
        self.headers = {'X-Dataverse-key': self.host['apikey']}
        submission = Submission().get_record(sub_id)
        dvAlias = submission['accessions']['dataverse_alias']
        dsId = submission['accessions']['dataset_id']
        conn = self._get_connection()
        dv = conn.get_dataverse(dvAlias)
        # ds = dv.get_dataset_by_doi(dsDoi)
        if not dv.is_published:
            dv.publish()
        # POST http://$SERVER/api/datasets/$id/actions/:publish?type=$type&key=$apiKey
        url = submission['destination_repo']['url']
        url = url + '/api/datasets/' + str(
            dsId) + '/actions/:publish?type=major'
        print(url)
        resp = requests.post(url,
                             data={
                                 'type': 'major',
                                 'key': self.host['apikey']
                             },
                             headers=self.headers)
        if resp.status_code not in (200, 201):
            raise OperationFailedError('The Dataset could not be published. ' +
                                       resp.text)

        doc = Submission().mark_as_published(sub_id)

        return doc

    def publish_dataset(self, dataset_id):
        url = self.host['url'] + '/api/datasets/' + str(
            dataset_id) + '/actions/:publish?type=major'

        resp = requests.post(url,
                             data={
                                 'type': 'major',
                                 'key': self.host['apikey']
                             },
                             headers=self.headers)

        if resp.status_code not in (200, 201):
            raise OperationFailedError('Dataset could not be published. ' +
                                       resp.text)

        return True

    def dc_dict_to_dc(self, sub_id):
        # get file metadata, call converter to strip out dc fields
        s = Submission().get_record(ObjectId(sub_id))
        f_id = s["bundle"][0]
        items = CgCoreSchemas().extract_repo_fields(str(f_id), "dataverse")
        temp_id = "copo:" + str(sub_id)
        # add the submission_id to the dataverse metadata to allow backwards traversal from dataverse
        items.append({
            "dc": "dc.relation",
            "copo_id": "submission_id",
            "vals": temp_id
        })
        Submission().update_meta(sub_id, json.dumps(items))

    def get_registered_types(self):
        """
        function uses a schema mapping of Dataverse types to drive conversion from cgcore to dataverse metadata
        Schema source: https://docs.google.com/spreadsheets/d/13HP-jI_cwLDHBetn9UKTREPJ_F4iHdAvhjmlvmYdSSw/edit#gid=0
        :return:
        """

        df = pd.read_csv(
            os.path.join(RESOLVER["cg_core_utils"],
                         'dataverse_schema_mapping.csv'))

        df.value = df['value'].fillna('')
        df.parent = df['parent'].fillna('')
        df.copo_id = df['copo_id'].fillna('')
        df.rename(index=str,
                  columns={"dataverse_id": "typeName"},
                  inplace=True)

        Attribute = namedtuple('Attribute', [
            'typeName', 'typeClass', 'multiple', 'value', 'parent', 'copo_id'
        ])
        registered_attributes = [Attribute(**x) for x in df.to_dict('records')]

        return registered_attributes

    def do_conversion(self):
        """
        function manages the conversion from CG Core to Dataverse types
        :return:
        """

        template = self.get_metadata_template()
        user_data = self.submission_record.get("meta",
                                               dict()).get("fields", list())
        citation_fragment = template["datasetVersion"]["metadataBlocks"][
            "citation"]
        citation_fragment["fields"] = self.get_dv_attributes(
            user_data=user_data)
        citation_fragment["displayName"] = self.get_display_name()

        return self.dump_metadata(template)

    def get_metadata_template(self):
        schemas_utils_paths = RESOLVER["cg_core_utils"]

        try:
            template = data_utils.json_to_pytype(
                os.path.join(schemas_utils_paths,
                             'dataverse_dataset_template.json'))
        except Exception as e:
            self.report_error("Couldn't retrieve Dataverse template. " +
                              str(e))
            raise

        return template

    def get_dv_attributes(self, user_data):
        """
        function sets attribute value for Dataverse fields from user data
        :param user_data:
        :return:
        """
        fields = list()

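        # walk the registered Dataverse attributes and populate each from predefined
        # values or from matching user-supplied (copo_id) entries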
        for attrib in self.get_registered_types():
            # dependent attribute - ignore for now
            if attrib.parent:
                continue

            # predefined values
            elif type(attrib.value) is bool or attrib.value:
                field = dict(attrib._asdict())

                if attrib.multiple is True:
                    field['value'] = [field['value']]

                field.pop('parent', None)
                field.pop('copo_id', None)
                fields.append(field)

            # primitive and controlledVocabulary types
            elif attrib.typeClass in ["primitive", "controlledVocabulary"]:
                val = [
                    x["vals"] for x in user_data
                    if x.get("copo_id", str()) == attrib.copo_id
                ]
                if val:
                    val = val[0]
                field = self.get_dv_primitive(attrib, val)
                if field:
                    fields.append(field)

            # compound type
            elif attrib.typeClass == "compound":
                children = [
                    x for x in self.get_registered_types()
                    if x.parent == attrib.typeName
                ]

                if not children:
                    continue

                values = list()

                children_values = dict()
                for child in children:
                    # obtain predefined values
                    predefined_children_values = list()
                    if type(child.value) is bool or child.value:
                        vals = child.value
                        if attrib.multiple is True:
                            vals = [vals]

                        predefined_children_value = self.get_dv_primitive(
                            child, vals)

                        if predefined_children_value:
                            predefined_children_values.append(
                                predefined_children_value)

                        continue

                    vals = [
                        x["vals"] for x in user_data
                        if x.get("copo_id", str()) == child.copo_id
                    ]

                    if vals:
                        vals = vals[0]
                        if not isinstance(vals, list):
                            vals = [vals]

                        for indx, vv in enumerate(vals):
                            children_values.setdefault(indx, []).append(
                                self.get_dv_primitive(child, vv))

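                # assemble each compound entry: child values sharing an index form one group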
                for entry in children_values:
                    new_dict = dict()
                    for descendant in children_values[entry]:
                        new_dict[descendant["typeName"]] = descendant

                    # add predefined children values
                    for descendant in predefined_children_values:
                        new_dict[descendant["typeName"]] = descendant

                    values.append(new_dict)

                field = self.get_dv_primitive(
                    attrib, [1])  # pass any value to generate parent dict
                field["value"] = values
                fields.append(field)

        return fields

    def get_dv_primitive(self, attrib, val):
        """
        function returns schema fragment for a dataverse primitive type, given val
        :param attrib:
        :param val:
        :return:
        """

        field = dict()

        if isinstance(val, list) and attrib.multiple is False:
            value = val[0]
        elif not isinstance(val, list) and attrib.multiple is True:
            value = [val]
        else:
            value = val

        if value:
            field = dict(attrib._asdict())
            field['value'] = value
            field.pop('parent', None)
            field.pop('copo_id', None)

        return field

    def get_display_name(self):
        """
        sets display name for Dataset
        :return:
        """

        profile = DAComponent(component="profile").get_record(
            self.submission_record.get("profile_id", str()))

        return profile.get("title", str())

    def dump_metadata(self, dv_metadata):
        """
        function writes converted metadata to file and returns the path on success
        :return:
        """

        # create submission file path
        if not os.path.exists(self.file_path):
            try:
                os.makedirs(self.file_path)
            except Exception as e:
                self.report_error("Error creating submission file path. " +
                                  str(e))
                raise

        path_to_json = os.path.join(self.file_path, 'dataset.json')

        try:
            with open(path_to_json, "w") as ff:
                ff.write(json.dumps(dv_metadata))
        except Exception as e:
            self.report_error("Error writing Dataset metadata to file. " +
                              str(e))
            raise

        return path_to_json

    def report_error(self, error_message):
        print(error_message)

        try:
            lg.log('Submission ID: ' + self.submission_id + " " +
                   error_message,
                   level=Loglvl.ERROR,
                   type=Logtype.FILE)
        except Exception:
            pass

        return False
Example #6
    def _do_aspera_transfer(self,
                            user_name=None,
                            password=None,
                            remote_path=None,
                            file_path=None,
                            path2library=None):

        lg.log('Starting aspera transfer',
               level=Loglvl.INFO,
               type=Logtype.FILE)

        kwargs = dict(target_id=self.submission_id,
                      commenced_on=str(datetime.now()))
        Submission().save_record(dict(), **kwargs)

        f_str = ' '.join(file_path)
        cmd = "./ascp -d -QT -l700M -L- {f_str!s} {user_name!s}:{remote_path!s}".format(
            **locals())
        lg.log(cmd, level=Loglvl.INFO, type=Logtype.FILE)
        os.chdir(path2library)

        try:
            thread = pexpect.spawn(cmd, timeout=None)
            thread.expect(["assword:", pexpect.EOF])
            thread.sendline(password)

            cpl = thread.compile_pattern_list([pexpect.EOF, '(.+)'])

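            # read ascp output line by line, extracting progress and status tokens as they appear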
            while True:
                i = thread.expect_list(cpl, timeout=None)
                if i == 0:  # EOF! Possible error point if encountered before transfer completion
                    print("Process termination - check exit status!")
                    break
                elif i == 1:
                    pexp_match = thread.match.group(1)
                    prev_file = ''
                    tokens_to_match = [
                        "Mb/s", "status=success", "status=started"
                    ]
                    units_to_match = ["KB", "MB", "GB"]
                    rates_to_match = [
                        "Kb/s", "kb/s", "Mb/s", "mb/s", "Gb/s", "gb/s"
                    ]
                    time_units = ['d', 'h', 'm', 's']
                    end_of_transfer = False

                    if any(tm in pexp_match.decode("utf-8")
                           for tm in tokens_to_match):
                        transfer_fields = dict()
                        tokens = pexp_match.decode("utf-8").split(" ")
                        lg.log(tokens, level=Loglvl.INFO, type=Logtype.FILE)

                        # has a file transfer started?
                        if 'status=started' in tokens:
                            # get the target file and update transfer record
                            target_file = [
                                tk for tk in tokens
                                if tk[:5] == "file=" or tk[:7] == "source="
                            ]

                            for up_f in target_file:
                                up_f_1 = up_f.split("=")[1].strip('"')

                                # update file path and datafile id
                                transfer_fields["file_path"] = up_f_1

                                submission_record = Submission().get_record(
                                    self.submission_id)
                                bundle_meta = submission_record.get(
                                    "bundle_meta", list())

                                listed_file = [
                                    indx
                                    for indx, elem in enumerate(bundle_meta)
                                    if elem['file_path'] == up_f_1
                                ]

                                if listed_file:
                                    transfer_fields[
                                        "datafile_id"] = bundle_meta[
                                            listed_file[0]]["file_id"]

                            # get original file size
                            file_size_bytes = [
                                x for x in tokens
                                if len(x) > 5 and x[:4] == 'size'
                            ]
                            if file_size_bytes:
                                t = file_size_bytes[0].split("=")[1]
                                transfer_fields["file_size_bytes"] = size(
                                    int(t), system=alternative)

                        # extract other file transfer metadata
                        if 'ETA' in tokens:
                            # get %completed, bytes transferred, current time etc
                            pct_completed = [
                                x for x in tokens
                                if len(x) > 1 and x[-1] == '%'
                            ]
                            if pct_completed:
                                transfer_fields[
                                    "pct_completed"] = pct_completed[0][:-1]
                                print(
                                    str(self.transfer_token) + ":  " +
                                    transfer_fields["pct_completed"] +
                                    "% transferred")

                            # bytes transferred
                            bytes_transferred = [
                                x for x in tokens
                                if len(x) > 2 and x[-2:] in units_to_match
                            ]
                            if bytes_transferred:
                                transfer_fields[
                                    "bytes_transferred"] = bytes_transferred[0]

                            # transfer rate
                            transfer_rate = [
                                x for x in tokens
                                if len(x) > 4 and x[-4:] in rates_to_match
                            ]
                            if transfer_rate:
                                transfer_fields[
                                    "transfer_rate"] = transfer_rate[0]

                            # current time - this will serve as the last time an activity was recorded
                            transfer_fields["current_time"] = datetime.now(
                            ).strftime("%d-%m-%Y %H:%M:%S")

                        # has a file been successfully transferred?
                        if 'status=success' in tokens:
                            # get the target file and update its status in the submission record
                            target_file = [
                                tk for tk in tokens
                                if tk[:5] == "file=" or tk[:7] == "source="
                            ]

                            for up_f in target_file:
                                up_f_1 = up_f.split("=")[1].strip('"')
                                submission_record = Submission().get_record(
                                    self.submission_id)

                                bundle_meta = submission_record.get(
                                    "bundle_meta", list())
                                listed_file = [
                                    indx
                                    for indx, elem in enumerate(bundle_meta)
                                    if elem['file_path'] == up_f_1
                                ]
                                if listed_file:
                                    bundle_meta[
                                        listed_file[0]]["upload_status"] = True
                                    kwargs = dict(target_id=self.submission_id,
                                                  bundle_meta=bundle_meta)
                                    Submission().save_record(dict(), **kwargs)

                                    # is this the final file to be transferred?
                                    submission_record = Submission(
                                    ).get_record(self.submission_id)
                                    if "bundle_meta" in submission_record:
                                        pending_files = [
                                            x["file_id"] for x in
                                            submission_record['bundle_meta']
                                            if not x["upload_status"]
                                        ]

                                        if not pending_files:  # we are all done!
                                            transfer_fields[
                                                "transfer_status"] = "completed"
                                            transfer_fields[
                                                "pct_completed"] = '100'
                                            transfer_fields[
                                                "current_time"] = datetime.now(
                                                ).strftime("%d-%m-%Y %H:%M:%S")

                        # save collected metadata to the transfer record
                        RemoteDataFile().update_transfer(
                            self.transfer_token, transfer_fields)

            thread.close()
            lg.log('Aspera Transfer completed',
                   level=Loglvl.INFO,
                   type=Logtype.FILE)

        except OSError:
            transfer_fields = dict()
            transfer_fields["error"] = "Encountered problems with file upload."
            transfer_fields["current_time"] = datetime.now().strftime(
                "%d-%m-%Y %H:%M:%S")
            lg.log('File upload error! Submission ID: ' + self.submission_id,
                   level=Loglvl.ERROR,
                   type=Logtype.FILE)

            # save error to transfer record
            RemoteDataFile().update_transfer(self.transfer_token,
                                             transfer_fields)
            return False

        self.context["ena_status"] = "files_transferred"
        return