Пример #1
0
class _RunScriptTask(UploadRunTask):
    """Upload task whose label is the name of the script it runs.

    `script_name` is only declared (annotated) here; no value is assigned
    in this class, so it has to be supplied elsewhere.
    """
    # Annotation only -- no class-level value is bound by this line.
    script_name: str

    # Path parameter for the log destination, declared with
    # significant=False.
    log_dest = pv.PathParam(significant=False)

    @property
    def label(self) -> str:
        """Return `script_name` as this task's label."""
        return self.script_name
Пример #2
0
class MigrateUpload(UploadRunTask):
    """Append the per-upload workspace fact tables to their destinations.

    The copying is delegated to the two `CopyRecords` tasks returned by
    `requires()`; `run_upload()` itself does nothing.

    TODO: explain why run() is trivial, i.e. why we
    don't get an upload_status record until the end. Or fix it.
    """
    upload_id = pv.IntParam()
    workspace_schema = pv.StrParam(default='HERON_ETL_1')
    i2b2_deid = pv.StrParam(default='BlueHeronData')
    db_url_deid = pv.StrParam()
    log_dest = pv.PathParam(significant=False)

    @property
    def label(self) -> str:
        """Return the task family name as the label."""
        return self.get_task_family()

    @property
    def source_cd(self) -> str:
        """Return the workspace schema name as the source code."""
        return self.workspace_schema

    def requires(self) -> Dict[str, luigi.Task]:
        """Return the copy tasks this migration depends on.

        Returns an empty dict when the task is already complete.
        Otherwise configures logging to `log_dest` and returns two
        append-mode `CopyRecords` tasks: key ``'id'`` copies
        ``observation_fact_<upload_id>`` into ``<schema>.observation_fact``
        and key ``'deid'`` copies ``observation_fact_deid_<upload_id>``
        into ``<i2b2_deid>.observation_fact`` at `db_url_deid`.
        """
        if self.complete():
            return {}

        _configure_logging(self.log_dest)

        # Keyword arguments that are the same for both copy tasks.
        shared = dict(
            mode='append',
            db_url=self.db_url,
            driver=self.driver,
            user=self.user,
            passkey=self.passkey,
        )
        workspace = self.workspace_schema
        upload = self.upload_id

        identified = CopyRecords(
            src_table=f'{workspace}.observation_fact_{upload}',
            dest_table=f'{self.schema}.observation_fact',
            db_url_dest=self.db_url,
            **shared)
        deidentified = CopyRecords(
            src_table=f'{workspace}.observation_fact_deid_{upload}',
            dest_table=f'{self.i2b2_deid}.observation_fact',
            db_url_dest=self.db_url_deid,
            **shared)
        return {'id': identified, 'deid': deidentified}

    def _upload_target(self) -> UploadTarget:
        """Build the `UploadTarget` for `schema`, named after `task_id`."""
        target = UploadTarget(self, self.schema, transform_name=self.task_id)
        return target

    def run_upload(self, conn: Connection, upload_id: int) -> None:
        """Do nothing; see the class docstring TODO."""
        return None
Пример #3
0
class NAACCR_FlatFile(ManualTask):
    """A NAACCR flat file is determined by the registry, export date,
    and version.

    The task counts as complete when the first record of `flat_file`
    carries the expected version, registry NPI and export date, and the
    file holds at least `record_qty_min` records. With `testData` set,
    a failed check is logged and ignored.
    """
    naaccrRecordVersion = pv.IntParam(default=180)
    dateCaseReportExported = pv.DateParam()
    npiRegistryId = pv.StrParam()
    testData = pv.BoolParam(default=False, significant=False)
    flat_file = pv.PathParam(significant=False)
    record_qty_min = pv.IntParam(significant=False, default=1)

    def check_version_param(self) -> None:
        """Only version 18 (180) is currently supported.

        Raises:
            NotImplementedError: if `naaccrRecordVersion` is not 180.
        """
        if self.naaccrRecordVersion != 180:
            raise NotImplementedError(
                f'unsupported naaccrRecordVersion: {self.naaccrRecordVersion}')

    def complete(self) -> bool:
        """Run `complete_action()` inside a `task_action` context and
        record its result on that context.
        """
        with task_action(self, 'complete') as ctx:
            result = self.complete_action()
            ctx.add_success_fields(result=result)
            return result

    def complete_action(self) -> bool:
        """Check the first record, assuming all the others have
        the same export date and registry NPI.

        Returns:
            True when all item checks pass and the record count reaches
            `record_qty_min`, or when `testData` is set (failures are
            then only logged); False otherwise.

        Raises:
            NotImplementedError: via `check_version_param()`.
        """
        self.check_version_param()

        with self.flat_file.open() as records:
            record0 = records.readline()
            # readline() returns '' on an empty file, so only count the
            # first record when there is one; iterate the handle rather
            # than readlines() so the file is not loaded into memory.
            qty = (1 if record0 else 0) + sum(1 for _ in records)
        qtyOk = qty >= self.record_qty_min
        log.info('record qty: %d (>= %d? %s)', qty, self.record_qty_min,
                 qtyOk)

        vOk = self._checkItem(record0, 'naaccrRecordVersion',
                              str(self.naaccrRecordVersion))
        regOk = self._checkItem(record0, 'npiRegistryId', self.npiRegistryId)
        dtOk = self._checkItem(record0, 'dateCaseReportExported',
                               self.dateCaseReportExported.strftime('%Y%m%d'))

        if vOk and regOk and dtOk and qtyOk:
            return True
        if self.testData:
            log.warning('ignoring failed FlatFile check')
            return True
        return False

    @classmethod
    def _checkItem(cls, record: str, naaccrId: str, expected: str) -> bool:
        '''Compare one fixed-width item of `record` with `expected`.

        The item's position comes from the `startColumn` and `length`
        attributes of `tr_ont.NAACCR1.itemDef(naaccrId)`; `startColumn`
        is treated as 1-based. A mismatch is logged as a warning.

        >>> npi = '1234567890'
        >>> record0 = ' ' * 19 + npi
        >>> NAACCR_FlatFile._checkItem(record0, 'npiRegistryId', npi)
        True
        >>> NAACCR_FlatFile._checkItem(record0, 'npiRegistryId', 'XXX')
        False
        '''
        itemDef = tr_ont.NAACCR1.itemDef(naaccrId)
        startColumn, length = (int(itemDef.attrib[it])
                               for it in ('startColumn', 'length'))
        # Convert the 1-based column to a 0-based slice [start:end).
        start = startColumn - 1
        end = start + length
        actual = record[start:end]
        matched = actual == expected
        if not matched:
            # Report the same bounds that were actually sliced.
            log.warning('%s: expected %s [%s:%s] = {%s} but found {%s}',
                        cls.__name__, naaccrId, start, end, expected, actual)
        return matched
Пример #4
0
class NAACCR_Ontology1(JDBCTask):
    """Build NAACCR ontology terms in an in-memory session and write them
    to `table_name` through a `td.Account`.

    The task's output is a query for a row whose `c_fullname` is the
    ontology top folder and whose `c_basecode` is `version_name`.
    """
    table_name = pv.StrParam(default="NAACCR_ONTOLOGY")
    who_cache = pv.PathParam()
    z_design_id = pv.StrParam(
        default='2019-12-16 pystdlib %s' %
        _stable_hash(tr_ont.NAACCR_I2B2.ont_script.code),
        description='''
        mnemonic for latest visible change to output.
        Changing this causes task_id to change, which
        ensures the ontology gets rebuilt if necessary.
        '''.strip(),
    )
    naaccr_version = pv.IntParam(default=18)  # ISSUE: ignored?
    jdbc_driver_jar = pv.StrParam(significant=False)

    # based on custom_meta
    col_to_type = {
        'c_hlevel': 'int',
        'c_fullname': 'varchar(700)',
        'c_name': 'varchar(2000)',
        'c_visualattributes': 'varchar(3)',
        'c_totalnum': 'int',
        'c_basecode': 'varchar(50)',
        'c_dimcode': 'varchar(700)',
        'c_tooltip': 'varchar(900)',
        'update_date': 'date',
        'sourcesystem_cd': 'varchar(50)',
    }
    # "name type" column declarations, comma-separated, in dict order.
    coltypes = ','.join(
        [f'{col} {sqltype}' for col, sqltype in col_to_type.items()])

    @property
    def version_name(self) -> str:
        """version info that fits in an i2b2 name (50 characters)
        """
        return 'v{}-{}'.format(self.naaccr_version, self.task_hash)

    @property
    def task_hash(self) -> str:
        """Return the part of `task_id` after its last underscore."""
        # hmm... luigi doesn't export this
        return self.task_id.rsplit('_', 1)[-1]

    def output(self) -> JDBCTableTarget:
        """Return a target built from a query for this version's
        top-folder row in `table_name`.
        """
        exists_query = fr"""
          (select 1 from {self.table_name}
           where c_fullname = '{tr_ont.NAACCR_I2B2.top_folder}'
           and c_basecode = '{self.version_name}')
        """
        return JDBCTableTarget(self, exists_query)

    @property
    def classpath(self) -> str:
        """Return the JDBC driver jar as the classpath."""
        return self.jdbc_driver_jar

    @property
    def __password(self) -> str:
        """Look up the password in the environment variable named by
        `passkey`.
        """
        import os  # ISSUE: ambient
        return os.environ[self.passkey]

    def account(self) -> td.Account:
        """Create the `td.Account` ('DEID') used for writing."""
        import subprocess  # ISSUE: AMBIENT
        return td.Account('DEID',
                          self.user,
                          self.__password,
                          subprocess.Popen,
                          url=self.db_url,
                          driver=self.driver)

    def run(self) -> None:
        """Compute the ontology terms and write them to `table_name`."""
        mem_conn = connect_mem(':memory:', detect_types=PARSE_COLNAMES)
        session = DBSession(mem_conn)
        # The first 10 characters of z_design_id are parsed as the
        # YYYY-MM-DD update date.
        design_date = self.z_design_id[:10]
        update_date = dt.datetime.strptime(design_date, '%Y-%m-%d').date()
        terms = tr_ont.NAACCR_I2B2.ont_view_in(session,
                                               who_cache=self.who_cache,
                                               task_hash=self.task_hash,
                                               update_date=update_date)
        destination = self.account()
        destination.wr(self.table_name, td.case_fold(terms))