Example #1
    def organ(self, award_number):
        if award_number in self.manual and award_number not in self.sourced:
            log.warning(f'used manual organ mapping for {award_number}')
        try:
            return self.award_to_organ[award_number]
        except KeyError as e:
            logd.error(f'bad award_number {award_number}')
Example #2
    def get(self, uri):
        #juri = uri + '.json'
        logd.info(uri)
        log.debug('going to network for protocols')
        resp = requests.get(uri, headers=self._pio_header)
        #log.info(str(resp.request.headers))
        if resp.ok:
            try:
                j = resp.json()  # the api is reasonably consistent
            except BaseException as e:
                log.exception(e)
                breakpoint()
                raise e
            return j
        else:
            try:
                j = resp.json()
                sc = j['status_code']
                em = j['error_message']
                msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
                logd.error(msg)
                self.addError(msg)
                # can't return here because of the cache
            except BaseException as e:
                log.exception(e)

            logd.error(f'protocol no access {uri} {self.id!r}')
Example #3
    def path_metadata(cls, path_dataset, manifests, xmls):
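        # lift every manifest record into path metadata, logging and
        # skipping records whose referenced files are missing or malformed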
        path_metadata = []
        scaffolds = []  # FIXME need a better abstraction for additional known types e.g. the mbf segmentations
        for manifest in manifests:
            if 'contents' in manifest:
                contents = manifest['contents']
                if 'manifest_records' in contents:
                    drp = manifest['dataset_relative_path']
                    _should_log = True
                    for record in contents['manifest_records']:
                        try:
                            lifted, _should_log = cls._lift_mr(
                                path_dataset, drp, record, _should_log)
                        except FileNotFoundError as e:
                            logd.error(e)
                            continue  # FIXME need this in the errors record
                        except exc.BadDataError as e:
                            logd.error(e)
                            continue  # FIXME need this in the errors record
                        path_metadata.append(lifted)
                        if 'errors' not in lifted:
                            cls._scaffolds(lifted, scaffolds)

        path_metadata.extend(xmls['xml'])
        # TODO try to construct/resolve the referent paths in these datasets as well
        # getting the path json metadata will embed errors about missing files for us
        return path_metadata, scaffolds
Example #4
    def _process(self, contributor):
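        # normalize the name fields, then pick an identifier for the
        # contributor: a valid orcid if present, otherwise the platform
        # member userid, otherwise a failover id derived from the name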
        # get member if we can find them
        he = dat.HasErrors(pipeline_stage=self.__class__.__name__ + '.data')
        if 'name' in contributor and 'first_name' in contributor:
            name = contributor['name']
            if ';' in name:
                msg = f'Bad symbol in name {name!r}'
                he.addError(msg)
                logd.error(msg)

            fn = contributor['first_name']
            ln = contributor['last_name']
            if ' ' in fn:
                fn, mn = fn.split(' ', 1)
                contributor['middle_name'] = mn
                contributor['first_name'] = fn

            if ' ' in ln:
                msg = f'Malformed last_name {ln!r}'
                he.addError(msg)
                logd.error(msg)
                ln = ln.replace(' ', '-')

            failover = f'{fn}-{ln}'
            member = self.member(fn, ln)

            if member is not None:
                userid = OntId('https://api.blackfynn.io/users/' + member.id)
                contributor['blackfynn_user_id'] = userid

        else:
            member = None
            failover = 'no-orcid-no-name'
            log.warning('No name!' + lj(contributor))

        orcid = None
        if 'contributor_orcid_id' in contributor:
            orcid = contributor['contributor_orcid_id']
            if isinstance(orcid, str) and 'orcid.org' in orcid:
                orcid = OrcidId(orcid)  # FIXME reloading from json

            if isinstance(orcid, OrcidId):
                s = orcid
            else:  # it's not an orcid or it's a bad orcid
                orcid = None

        if orcid is None:
            if member is not None:
                s = userid
            else:
                log.debug(lj(contributor))
                s = OntId(self.dsid + '/contributors/' + failover)

        contributor['id'] = s
        he.embedErrors(contributor)
Example #5
    def _lift_mr(path_dataset, dataset_relative_path, record, should_log):
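        # resolve the manifest record to a path under the dataset, pull its
        # json metadata, and embed an error if the referenced file is missing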
        parent = dataset_relative_path.parent
        if 'filename' not in record:
            msg = f'filename missing from record in {dataset_relative_path}'
            raise exc.BadDataError(msg)

        record_drp = parent / record['filename']

        # FIXME this is validation, move elsewhere ??? but this is also
        # where we want to embed data about the path ...
        # but I suppose we could inject errors later ? this is an
        # extremely obscure place to inject these ...
        # and to do that we need access to the path
        # and it is clear given that we passed in THIS_PATH

        # FIXME TODO how to deal with sparse datasets
        _record_path = path_dataset / record_drp  # do not include in export

        _cache = _record_path.cache
        if _cache is None:
            lifted = _record_path._jsonMetadata()  # will produce the error for us
        else:
            lifted = _cache._jsonMetadata()

        if 'errors' in lifted:
            he = HasErrors(pipeline_stage='Derives._lift_mr')
            _message = lifted['errors'].pop()['message']
            # FIXME pretty sure that path in addError is used for both
            # the json path and the file system path
            message = ('Non-existent path listed in manifest '
                       f'{dataset_relative_path}\n{_message}')
            if he.addError(message, blame='submission', path=_record_path):
                if should_log:
                    should_log = False
                    logd.error(message)

            he.embedErrors(lifted)

        lifted['prov:wasDerivedFrom'] = (
            # have to reattach path_dataset because Path.cwd() may not be
            # the dataset root (usually the organization root)
            path_dataset / dataset_relative_path).cache_identifier
        lifted['dataset_relative_path'] = record_drp
        lifted['manifest_record'] = {
            k: v
            for k, v in record.items() if k != 'filetype'
        }
        if 'additional_types' in record:
            # FIXME TODO make sure that the mimetypes match
            # FIXME currently a string, either change name or make a list?
            lifted['mimetype'] = record['additional_types']

        return lifted, should_log
Example #6
    def subpipeline_errors(self, errors):
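        # only errors that did not come from the samples or subjects file
        # subpipelines are forwarded to the parent class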
        paths = []
        saf = ['samples_file']
        suf = ['subjects_file']
        for_super = []
        for path, error, subpipeline_class in errors:
            paths.append(path)
            if path not in (saf, suf):
                for_super.append((path, error, subpipeline_class))

        if saf not in paths and suf not in paths:
            logd.error('neither samples_file nor subjects_file')

        super().subpipeline_errors(for_super)
Example #7
    def _abstracted_paths(self, name_prefix, glob_type=None):
        """ A bottom up search for the closest file in the parent directory.
            For datasets, if the bids root and path do not match, use the bids root.
            In the future this needs to be normalized because the extra code required
            for dealing with the intervening node is quite annoying to maintain.
        """
        if glob_type is None:
            glob_type = self.default_glob

        path = self
        if (self.cache and
            self.cache.is_dataset and
            self.bids_root is not None and
            self.bids_root != self):
            path = self.bids_root

        first = name_prefix[0]
        cased_np = '[' + first.upper() + first + ']' + name_prefix[1:]  # FIXME warn and normalize
        glob = getattr(path, glob_type)
        gen = glob(cased_np + '*.*')

        try:
            path = next(gen)
            for path in chain((path,), gen):
                if path.is_broken_symlink():
                    log.info(f'fetching unretrieved metadata path {path.as_posix()!r}'
                             '\nFIXME batch these using async in cli export ...')
                    path.cache.fetch(size_limit_mb=path.cache.meta.size.mb + 1)

                if path.suffix in path.stem:
                    msg = f'path has duplicate suffix {path.as_posix()!r}'
                    self.addError(msg)
                    logd.error(msg)

                if path.name[0].isupper():
                    msg = f'path has bad casing {path.as_posix()!r}'
                    self.addError(msg)
                    logd.error(msg)

                yield path

        except StopIteration:
            if (self.cache.parent.meta is not None and
                self.parent.cache != self.cache.anchor and
                self.parent != self):
                yield from getattr(self.parent, name_prefix + '_paths')
Example #8
    def contributor_orcid_id(self, value):
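        # normalize the forms an orcid can arrive in (bare 19 character id,
        # ORCID: curie, or orcid.org url) before validating the checksum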
        # FIXME use schema
        v = value.replace(' ', '')
        if not v:
            return
        if v.startswith('http:'):
            v = v.replace('http:', 'https:', 1)

        if not (v.startswith('ORCID:') or v.startswith('https:')):
            v = v.strip()
            if not len(v):
                return
            elif v == '0':  # FIXME ? someone using strange conventions ...
                return
            elif len(v) != 19:
                msg = f'orcid wrong length {value!r} {self.t.path.as_posix()!r}'
                self.addError(OrcidId.OrcidLengthError(msg))
                logd.error(msg)
                return

            v = 'ORCID:' + v

        else:
            if v.startswith('https:'):
                _, numeric = v.rsplit('/', 1)
            elif v.startswith('ORCID:'):
                _, numeric = v.rsplit(':', 1)

            if not len(numeric):
                return
            elif len(numeric) != 19:
                msg = f'orcid wrong length {value!r} {self.t.path.as_posix()!r}'
                self.addError(OrcidId.OrcidLengthError(msg))
                logd.error(msg)
                return

        try:
            #log.debug(f"{v} '{self.t.path}'")
            orcid = OrcidId(v)
            if not orcid.checksumValid:
                # FIXME json schema can't do this ...
                msg = f'orcid failed checksum {value!r} {self.t.path.as_posix()!r}'
                self.addError(OrcidId.OrcidChecksumError(msg))
                logd.error(msg)
                return

            yield orcid

        except (OntId.BadCurieError, OrcidId.OrcidMalformedError) as e:
            msg = f'orcid malformed {value!r} {self.t.path.as_posix()!r}'
            self.addError(OrcidId.OrcidMalformedError(msg))
            logd.error(msg)
            yield value
Example #9
    def csv(self, delimiter=','):
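        # try utf-8 first and fall back to latin-1, recording an error
        # when the fallback encoding is the one that works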
        for encoding in ('utf-8', 'latin-1'):
            try:
                with open(self.path, 'rt', encoding=encoding) as f:
                    for row in csv.reader(f, delimiter=delimiter):
                        if row:
                            yield row
                        else:
                            message = f'empty row in {self.path.as_posix()!r}'
                            self.addError(message)
                            logd.error(message)

                if encoding != 'utf-8':
                    message = f'encoding bad {encoding!r} {self.path.as_posix()!r}'
                    self.addError(exc.EncodingError(message))
                    logd.error(message)
                return
            except UnicodeDecodeError:
                continue
Example #10
    def xlsx(self):
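        # convert the first sheet to tab separated values and yield its rows,
        # recording an error if the workbook has more than one sheet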
        kwargs = {
            'delimiter' : '\t',
            'skip_empty_lines' : True,
            'outputencoding': 'utf-8',
        }
        sheetid = 1
        xlsx2csv = Xlsx2csv(self.path.as_posix(), **kwargs)
        ns = len(xlsx2csv.workbook.sheets)
        if ns > 1:
            message = f'too many sheets ({ns}) in {self.path.as_posix()!r}'
            self.addError(exc.EncodingError(message))
            logd.error(message)

        f = io.StringIO()
        try:
            xlsx2csv.convert(f, sheetid)
            f.seek(0)
            gen = csv.reader(f, delimiter='\t')
            yield from gen
        except SheetNotFoundException as e:
            log.warning(f'Sheet weirdness in {self.path}')
            log.warning(str(e))
Example #11
    def _get_protocol_json(self, uri):
        #juri = uri + '.json'
        logd.info(
            uri.identifier if isinstance(uri, idlib.Stream) else uri)  # FIXME
        pi = idlib.get_right_id(uri)
        if 'protocols.io' in pi:
            pioid = pi.slug  # FIXME normalize before we ever get here ...
            log.info(pioid)
        else:
            msg = f'protocol uri is not from protocols.io {pi} {self.id}'
            logd.error(msg)
            self.addError(msg)
            return

        #uri_path = uri.rsplit('/', 1)[-1]
        apiuri = 'https://www.protocols.io/api/v3/protocols/' + pioid
        #'https://www.protocols.io/api/v3/groups/sparc/protocols'
        #apiuri = 'https://www.protocols.io/api/v3/filemanager/folders?top'
        #print(apiuri, header)
        log.debug('going to network for protocols')
        resp = requests.get(apiuri, headers=self._pio_header)
        #log.info(str(resp.request.headers))
        if resp.ok:
            try:
                j = resp.json()  # the api is reasonably consistent
            except BaseException as e:
                log.exception(e)
                breakpoint()
                raise e
            return j
        else:
            try:
                j = resp.json()
                sc = j['status_code']
                em = j['error_message']
                msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
                logd.error(msg)
                self.addError(msg)
                # can't return here because of the cache
            except BaseException as e:
                log.exception(e)

            logd.error(f'protocol no access {uri} {self.id!r}')
Example #12
    def data(self):  # TODO candidate for memory.cache
        if hasattr(self, '_data_cache'):
            return self._data_cache

        index_col, *_ = self.to_index
        out = {}
        if not hasattr(self.bc, index_col):
            msg = f'{self.path.as_posix()!r} malformed header!'
            self.addError(msg)
            logd.error(msg)
            self.embedErrors(out)
            self._data_cache = out
            return out

        ic = list(getattr(self.bc, index_col))
        nme = Header(ic).data
        nmed = {v:normk for normk, v in zip(nme, ic)}

        for v, nt in self.bc._byCol__indexes[index_col].items():
            if v != index_col:
                normk = nmed[v]
                if normk not in self.skip_rows:
                    _value = tuple(normv for key, value in zip(nt._fields, nt)
                                   if key not in self.skip_cols and value
                                   for normv in self.normalize(normk, value)
                                   if normv)
                    value = tuple(set(_value))
                    if len(value) != len(_value):
                        # TODO counter to show the duplicate values
                        msg = f'duplicate values in {normk} TODO {self.path.as_posix()!r}'
                        self.addError(msg)
                        logd.error(msg)

                    if normk in self.max_one:  # schema will handle this ..
                        if not value:
                            #log.warning(f"missing value for {normk} '{self.t.path}'")
                            pass
                        elif len(value) > 1:
                            msg = f'too many values for {normk} {value} {self.path.as_posix()!r}'
                            self.addError(msg)
                            logd.error(msg)
                            # FIXME not selecting the zeroth element here breaks the schema assumptions
                            #value = 'AAAAAAAAAAA' + '\n|AAAAAAAAAAA|\n'.join(value)
                            #value = 'ERROR>>>' + ','.join(value)
                            # just leave it
                        else:
                            value = value[0]  # FIXME error handling etc.

                    if value:
                        out[normk] = value

        def merge(tup):
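            # the first value for a key is kept as is, any further values
            # for the same key accumulate into a tuple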
            out = {}
            for a, b in tup:
                if a not in out:
                    out[a] = b
                elif a and not isinstance(b, tuple):
                    out[a] = out[a], b
                else:
                    out[a] += b,

            return out

        for key, keys in self.verticals.items():
            gen = (merge([(self.rename_key(k, key), normv)
                          for k, value in zip(nme, values)
                          if k in keys and value
                          for normv in self.normalize(k, value)
                          if normv])
                   for head, *values in self.bc.cols
                   if head not in self.skip_cols)
            value = tuple(_ for _ in gen if _)
            if value:
                out[key] = value

        self.embedErrors(out)
        self._data_cache = out
        return out
Example #13
    def validate_structure(path, dir_structure, subjects, samples):

        he = HasErrors(pipeline_stage='Derives.validate_structure')

        # FIXME TODO handle pools as well and figure out cases where subjects/samples are metadata only

        # for dataset templates of the 1.* series
        # general approach: set of all specimen ids and set of all
        # folder names take the ones that match ignore the known ok
        # that do not, and warn on all the rest that do not match
        # and that are not inside a known specimen or subject folder

        valid_top_123 = (
            'source',
            'primary',
            'derivative',  # FIXME not here :/ schema somehow?
            'code',
            'docs',
            'protocol')

        def top_level(drp):
            return drp.parent.name == '' and drp.name in valid_top_123

        # absolute_paths = [path / pblob['dataset_relative_path'] for pblob in dir_structure]
        dd = defaultdict(list)
        for pblob in dir_structure:
            drp = pblob['dataset_relative_path']
            p = drp.parts
            dd[p[-1]].append((len(p), drp, p[::-1]))

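        # index the non top level directories by their terminal folder name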
        dirs = {
            k: av
            for k, vs in dd.items()
            for av in ([v for v in vs if not top_level(v[1])], )
            # cull empty in a single step
            if av
        }

        # subject_id could be missing, but we filter failures on all of
        # those so in theory we shouldn't need to handle it as this stage
        subs = {s['subject_id']: s for s in subjects}
        dd = defaultdict(list)
        for s in samples:
            dd[s['sample_id']].append(s)
        samps = dict(dd)

        union_sub = set(dirs) | set(subs)
        inter_sub = set(dirs) & set(subs)

        records = []
        done_dirs = set()
        done_specs = set()
        if inter_sub == set(subs):
            for subject_id, blob in subs.items():
                done_dirs.add(subject_id)
                done_specs.add(subject_id)
                records.append({
                    'type': 'SubjectDirs',
                    # have to split the type because we can't recover
                    # the type using just the specimen id (sigh)
                    # and we need it to set the correct prefix (sigh)
                    'specimen_id': subject_id,
                    'dirs': [d[1] for d in dirs[subject_id]]
                })
        else:
            # FIXME not all subjects have folders there may be samples
            # that have folders but not subjects ??? don't want to force
            # metadata structure onto folder structure but it complicates
            # the implementation again ... probably worth it in this case
            logd.warning('miscount subject dirs, TODO')
            pass

        union_sam = set(dirs) | set(samps)
        inter_sam = set(dirs) & set(samps)

        template_version_less_than_2 = True  # FIXME TODO
        # FIXME this is where non-uniqueness of sample ids becomes a giant pita
        if inter_sam == set(samps):
            for sample_id, blob in samps.items():
                if len(blob) > 1:
                    # FIXME TODO this means that we need to fail over to the primary keys
                    msg = f'sample_id is not unique! {sample_id}\n{blob}'
                    if he.addError(msg, blame='submission', path=path):
                        logd.error(msg)
                    continue

                if template_version_less_than_2:  # FIXME this is sure to cause an error at some point
                    done_dirs.add((blob[0]['subject_id'], sample_id))
                    done_specs.add(blob[0]['primary_key'])
                else:
                    done_dirs.add(sample_id)
                    done_specs.add(sample_id)
                records.append({
                    'type': 'SampleDirs',
                    # have to split the type because we can't recover
                    # the type using just the specimen id (sigh)
                    # and we need it to set the correct prefix (sigh)
                    'specimen_id': sample_id,
                    'dirs': [d[1] for d in dirs[sample_id]]
                })
        else:
            logd.warning('miscount sample dirs, TODO')
            bad_dirs = []
            if template_version_less_than_2:
                # handle old awful nonsense
                # 1. construct subject sample lookups using tuple
                # 2. try to construct subject sample id pairs
                for sample_id, blobs in samps.items():
                    for blob in blobs:
                        if sample_id in dirs:
                            candidates = dirs[sample_id]
                            # TODO zero candidates error
                            actual = []
                            for level, drp, rparts in candidates:
                                if level < 2:
                                    msg = (
                                        f'Bad location for specimen folder! {drp}'
                                    )
                                    if he.addError(msg,
                                                   blame='submission',
                                                   path=path):
                                        logd.error(msg)
                                    bad_dirs.append(dirs.pop(sample_id))
                                    continue
                                p_sample_id, p_subject_id, *p_rest = rparts
                                if level < 3:
                                    # p_subject_id will be primary derivative or source
                                    log.warning(f'TODO new structure {drp}')

                                assert sample_id == p_sample_id  # this should always be true
                                subject_id = blob['subject_id']
                                if subject_id == p_subject_id:
                                    id = blob['primary_key']
                                    done_dirs.add((subject_id, p_sample_id))
                                    done_specs.add(id)
                                    actual.append(drp)

                            if actual:
                                records.append({
                                    'type': 'SampleDirs',
                                    # have to split the type because we can't recover
                                    # the type using just the specimen id (sigh)
                                    # and we need it to set the correct prefix (sigh)
                                    'specimen_id': id,
                                    'dirs': actual,
                                })
                    else:
                        msg = f'No folder for sample {sample_id}'
                        if he.addError(msg, blame='submission', path=path):
                            logd.error(msg)
            else:
                pass  # TODO that's an error!

        usamps = set(v['primary_key'] for vs in samps.values() for v in vs)
        udirs = set(
            nv for path_name, subpaths in dirs.items()
            for nv in (((subpath[-1][1], path_name) for subpath in subpaths
                        )  # -1 rpaths 1 parent  # XXX FIXME clearly wrong ???
                       if path_name in samps else (path_name, )))
        not_done_specs = (set(subs) | usamps) - set(done_specs)
        not_done_dirs = set(udirs) - set(done_dirs)

        obj = {}

        if records:
            obj['records'] = records
        else:
            pass  # TODO embed an error

        if not_done_specs:
            msg = ('There are specimens that have no corresponding '
                   f'directory!\n{not_done_specs}')
            if he.addError(msg, blame='submission', path=path):
                logd.error(msg)

        if not_done_dirs:
            msg = ('There are directories that have no corresponding '
                   f'specimen!\n{not_done_dirs}')
            if he.addError(msg, blame='submission', path=path):
                logd.error(msg)

        he.embedErrors(obj)
        return obj,
Example #14
    def _process(self, contributor):
        # get member if we can find them
        he = dat.HasErrors(pipeline_stage=self.__class__.__name__ + '.data')
        if 'contributor_name' in contributor and 'first_name' in contributor:
            name = contributor['contributor_name']
            if ';' in name:
                msg = f'Bad symbol in name {name!r}'
                he.addError(msg)
                logd.error(msg)

            fn = contributor['first_name']
            ln = contributor['last_name']
            if ' ' in fn:
                fn, mn = fn.split(' ', 1)
                mn, _mn = mn.rstrip('.'), mn
                if mn != _mn:
                    he.addError(f'Middle initials don\'t need periods :) {name!r}',
                                logfunc=logd.error)
                contributor['middle_name'] = mn
                contributor['first_name'] = fn

            if ' ' in ln:
                msg = f'Malformed last_name {ln!r}'
                he.addError(msg)
                logd.error(msg)
                ln = ln.replace(' ', '-')

            failover = f'{fn}-{ln}'
            member = self.member(fn, ln)

            if member is not None:
                userid = OntId('https://api.blackfynn.io/users/' + member.id)
                contributor['blackfynn_user_id'] = userid

        else:
            member = None
            failover = 'no-orcid-no-name'
            log.warning('No name!' + lj(contributor))

        orcid = None
        if 'contributor_orcid_id' in contributor:
            orcid = contributor['contributor_orcid_id']
            if isinstance(orcid, str) and 'orcid.org' in orcid:
                orcid = idlib.Orcid(orcid)  # FIXME reloading from json

            if isinstance(orcid, idlib.Orcid):
                s = orcid
            else:  # it's not an orcid or it's a bad orcid
                orcid = None

        if orcid is None:
            if member is not None:
                s = userid
            else:
                log.debug(lj(contributor))
                s = OntId(self.dsid + '/contributors/' + failover)

        contributor['id'] = s
        he.embedErrors(contributor)

        # lifting + adding
        if 'contributor_affiliation' in contributor:
            ca = contributor['contributor_affiliation']
            maybe_ror = self.lifters.affiliations(ca)
            if maybe_ror is not None:
                contributor['affiliation'] = maybe_ror