Example #1
    def __init__(self, path):
        super().__init__()
        self.path = path
        tabular = Tabular(self.path)
        self.skip_rows = tuple(key for keys in self.verticals.values() for key in keys)
        self.t = tabular
        l = list(tabular)
        if not l:
            # FIXME bad design, this guard is a workaround for bad handling of empty lists
            raise exc.NoDataError(self.path)

        self.orig_header, *rest = l
        header = Header(self.orig_header).data

        self.fail = False
        if self.to_index:
            for head in self.to_index:
                if head not in header:
                    log.error(f'\'{self.t.path}\' malformed header!')
                    self.fail = True

        if self.fail:
            self.bc = byCol(rest, header)
        else:
            self.bc = byCol(rest, header, to_index=self.to_index)
Example #2
    def normalize(self, key, value):
        v = value.replace('\ufeff', '')  # FIXME utf-16 issue
        if v != value:  # TODO can we decouple encoding from value normalization?
            message = f"encoding feff error in '{self.path}'"
            log.error(message)
            self.addError(exc.EncodingError(message))

        if v.lower().strip() not in ('n/a', 'na', 'no'):  # FIXME explicit null vs remove from structure
            yield from getattr(self, key, self.default)(v)
Example #3
    def setup(cls, *, local_only=False):
        # FIXME this is a mess
        """ make sure we have all datasources
            calling this again will refresh helpers
        """
        if hasattr(Integrator, '__setup') and Integrator.__setup:
            return  # already setup

        Integrator.__setup = True

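        # run setup on any ancestor class that defines it before configuring this one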
        for _cls in cls.mro():
            if _cls != cls:
                if hasattr(_cls, 'setup'):
                    _cls.setup()

        dat.DatasetStructure.rate = cls.rate

        # unanchored helpers
        if cls.no_google or local_only:
            log.critical('no google no organ data')

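            # offline stand-ins that match the interface of the real sheets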
            class FakeOrganSheet:
                modality = lambda v: None
                organ_term = lambda v: None
                award_manual = lambda v: None
                byCol = _byCol([['award', 'award_manual', 'organ_term'], []])
                techniques = lambda v: []
                protocol_uris = lambda v: []

            class FakeAffilSheet:
                def __call__(self, *args, **kwargs):
                    return

            cls.organs_sheet = FakeOrganSheet
            cls.affiliations = FakeAffilSheet()
        else:
            cls.organs_sheet = sheets.Organs()  # ipv6 resolution issues :/
            cls.affiliations = sheets.Affiliations()

        if cls.no_google:
            cls.organ = lambda award: None

        if local_only:
            cls.organ = lambda award: None
            cls.member = lambda first, last: None
        else:
            cls.organ = OrganData()
            if hasattr(State, 'member'):
                cls.member = State.member
            else:
                log.error('State missing member, using State seems '
                          'like a good idea until you go to multiprocessing')
                cls.member = lambda first, last: None
Example #4
    def __init__(self, previous_pipeline, lifters, runtime_context):
        if hasattr(State, 'member'):
            self.member = State.member
        else:
            log.error('State missing member, using State seems '
                      'like a good idea until you go to multiprocessing')
            self.member = lambda first, last: None

        self.contributors = previous_pipeline.data
        self.runtime_context = runtime_context
        self.dataset_id = runtime_context.id
        self.dsid = runtime_context.uri_api  # FIXME need a BlackfynnId class
        self.lifters = lifters
Example #5
    def counts(self):
        if not hasattr(self, '_counts'):
            size = 0
            dirs = 0
            files = 0
            need_meta = []
            if not self.is_dir():
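                # a single file: wrap self in a one-element tuple so the loop below is uniform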
                gen = self,

            else:
                gen = self.rchildren

            for c in gen:
                if c.is_dir():
                    dirs += 1
                else:
                    files += 1  # testing for broken symlinks is hard
                    try:
                        maybe_size = c.cache.meta.size
                    except AttributeError as e:
                        log.error(f'no cache or no meta for {c}\n{e}')
                        continue

                    if maybe_size is None:
                        need_meta.append(c)
                    else:
                        size += maybe_size

            if need_meta and self._refresh_on_missing:
                nl = '\n'
                log.info(
                    f'refreshing {len(need_meta)} files with missing metadata in {self}'
                    f'\n{nl.join(_.as_posix() for _ in need_meta)}')
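                # refresh the missing metadata in parallel, rate limited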
                new_caches = Async(rate=self.rate)(deferred(c.cache.refresh)()
                                                   for c in need_meta)
                for c in new_caches:  # FIXME first time around meta doesn't get updated ??
                    if c is None:
                        continue  # file was deleted (logged previously)

                    if c.meta is None:
                        log.critical(f'missing metadata! {c}')
                        continue

                    size += c.meta.size

            self._counts = dict(size=FileSize(size), dirs=dirs, files=files)

        return self._counts
Example #6
    def _param(self, value):
        try:
            pv = pyru.UnitsParser(value).asPython()
        except pyru.UnitsParser.ParseFailure as e:
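            # the first traceback frame is _param itself; f_back names its caller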
            caller_name = e.__traceback__.tb_frame.f_back.f_code.co_name
            msg = f'Unexpected and unhandled value {value} for {caller_name}'
            log.error(msg)
            self.addError(msg, pipeline_stage=self.__class__.__name__ + '.curation-error')
            return value

        #if not pv[0] == 'param:parse-failure':
        if pv is not None:  # parser failure  # FIXME check on this ...
            yield pv  # this one needs to be a string since it is combined below
        else:
            # TODO warn
            yield value
Example #7
    def _protcur(self, protocol_uri, filter=lambda p: True):
        self.lazy_setup()
        protocol_uri = get_right_id(protocol_uri)
        gen = (p for p in protc if p.uri.startswith(protocol_uri) and filter(p))

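        # pull the first match inside the try so an empty generator can be detected and logged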
        try:
            p = next(gen)
            yield p
            yield from gen
        except StopIteration:
            log.error(f'could not find annotations for {protocol_uri}')
            return

        if p.document.otherVersionUri:  # FIXME also maybe check /abstract?
            other_uri = p.document.otherVersionUri
            yield from (p for p in protc if p.uri.startswith(other_uri) and filter(p))
Example #8
    def __init__(self, path):
        super().__init__()
        self.path = path
        if self._is_json:
            with open(self.path, 'rt') as f:
                try:
                    self._data_raw = json.load(f)
                except json.decoder.JSONDecodeError as e:
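                    # a read position of 0 after the failed parse means the file was empty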
                    if not f.buffer.tell():
                        raise exc.NoDataError(self.path)
                    else:
                        raise exc.BadDataError(self.path) from e

            if isinstance(self._data_raw, dict):
                # FIXME this breaks downstream assumptions
                self._data_cache = {self.rename_key(k):tos(self.normalize(k, v))  # FIXME FIXME
                                    for k, v in self._data_raw.items()}

            return

        tabular = Tabular(self.path)
        self.skip_rows = tuple(key for keys in self.verticals.values() for key in keys)
        self.t = tabular
        l = list(tabular)
        if not l:
            # FIXME bad design, this guard is a workaround for bad handling of empty lists
            raise exc.NoDataError(self.path)

        self.orig_header, *rest = l
        header = Header(self.orig_header).data

        self.fail = False
        if self.to_index:
            for head in self.to_index:
                if head not in header:
                    log.error(f'\'{self.t.path}\' malformed header!')
                    self.fail = True

        if self.fail:
            try:
                self.bc = byCol(rest, header)
            except ValueError as e:
                raise exc.BadDataError(self.path) from e
        else:
            self.bc = byCol(rest, header, to_index=self.to_index)
Example #9
    def triples_contributors(self, contributor, creator=False):
        try:
            dsid = self.dsid  # FIXME json reload needs to deal with this
        except BaseException as e:  # FIXME ...
            log.error(e)
            return

        s = rdflib.URIRef(contributor['id'])  # FIXME json reload needs to deal with this

        if 'blackfynn_user_id' in contributor:
            userid = rdflib.URIRef(contributor['blackfynn_user_id'])
            yield s, TEMP.hasBlackfynnUserId, userid

        yield s, a, owl.NamedIndividual
        yield s, a, sparc.Researcher
        yield s, TEMP.contributorTo, dsid
        converter = conv.ContributorConverter(contributor)
        yield from converter.triples_gen(s)
        if creator:
            yield s, TEMP.creatorOf, dsid
Example #10
    def subpipeline_errors(self, errors):
        """ override this for pipeline specific error handling rules """
        for path, error, subpipeline_class in errors:
            log.error(f'{path}\n{error}\n{subpipeline_class}\n{self!r}')
Example #11
    def setup(cls, *, local_only=False):
        # FIXME this is a mess
        """ make sure we have all datasources
            calling this again will refresh helpers
        """
        if hasattr(Integrator, '__setup') and Integrator.__setup:
            return  # already setup

        Integrator.__setup = True

        for _cls in cls.mro():
            if _cls != cls:
                if hasattr(_cls, 'setup'):
                    _cls.setup()

        dat.DatasetStructure.rate = cls.rate

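        # offline stand-ins that match the interface of the real sheets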
        class FakeOrganSheet:
            modality = lambda v: None
            organ_term = lambda v: None
            award_manual = lambda v: None
            byCol = _byCol([['award', 'award_manual', 'organ_term'], []])
            techniques = lambda v: []
            protocol_uris = lambda v: []

        class FakeAffilSheet:
            def __call__(self, *args, **kwargs):
                return

        class FakeOverviewSheet:
            def __call__(self, *args, **kwargs):
                return

        # unanchored helpers
        if cls.no_google or local_only:
            log.critical('no google no organ data')
            cls.organs_sheet = FakeOrganSheet
            cls.affiliations = FakeAffilSheet()
            cls.overview_sheet = FakeOverviewSheet()
        else:
            # ipv6 resolution issues :/ also issues with pickling
            #cls.organs_sheet = sheets.Organs(fetch_grid=True)  # this kills parallelism
            cls.organs_sheet = sheets.Organs()  # ok if fetch_grid = False at the class level
            cls.affiliations = sheets.Affiliations()
            cls.overview_sheet = sheets.Overview()

            # zap all the services (apparently doesn't help)
            # yep, its just the organ sheet, these go in and out just fine
            #if hasattr(sheets.Sheet, '_Sheet__spreadsheet_service'):
            #delattr(sheets.Sheet, '_Sheet__spreadsheet_service')
            #if hasattr(sheets.Sheet, '_Sheet__spreadsheet_service_ro'):
            #delattr(sheets.Sheet, '_Sheet__spreadsheet_service_ro')

            #for s in (cls.organs_sheet, cls.affiliations, cls.overview_sheet):
            #if hasattr(s, '_spreadsheet_service'):
            #delattr(s, '_spreadsheet_service')

            # YOU THOUGHT IT WAS GOOGLE IT WAS ME ORGANS ALL ALONG!
            #cls.organs_sheet = FakeOrganSheet  # organs is BAD

            #cls.affiliations = FakeAffilSheet()  # affiliations is OK
            #cls.overview_sheet = FakeOverviewSheet()  # overview is OK

            #breakpoint()
            # remove byCol which is unpickleable (super duper sigh)
            #for s in (cls.organs_sheet, cls.affiliations, cls.overview_sheet):
            #if hasattr(s, 'byCol'):
            #delattr(s, 'byCol')

        if cls.no_google:
            cls.organ = lambda award: None

        if local_only:
            cls.organ = lambda award: None
            cls.member = lambda first, last: None
        else:
            cls.organ = OrganData()
            if hasattr(State, 'member'):
                cls.member = State.member
            else:
                log.error('State missing member, using State seems '
                          'like a good idea until you go to multiprocessing')
                cls.member = lambda first, last: None
Example #12
    def __iter__(self):
        try:
            yield from self.normalize(getattr(self, self.file_extension)())
        except UnicodeDecodeError as e:
            log.error(f'{self.path.as_posix()!r} {e}')