def get_inference_images(self,
                         project,
                         epoch=None,
                         numEpochs=None,
                         goldenQuestionsOnly=False,
                         forceUnlabeled=False,
                         maxNumImages=None,
                         numChunks=1):
        '''
            Collects the IDs of the images that inference is to be run on.

            If "maxNumImages" is None or not positive, the limit is read from
            the "maxNumImages_inference" column of the project's row in
            "aide_admin.project". The actual query is assembled by
            "self.sqlBuilder.getInferenceQueryString", which receives the
            project, "forceUnlabeled", "goldenQuestionsOnly" and the limit.

            Returns a list of image ID lists: a single-element list holding
            all IDs if "numChunks" is not greater than one; otherwise the
            result of "array_split" for the IDs and
            max(1, number of IDs // numChunks).

            "epoch" and "numEpochs" are accepted, but not used in here.
        '''
        useProjectDefault = maxNumImages is None or maxNumImages <= 0
        if useProjectDefault:
            # no usable limit provided; fall back to the project's setting
            projectRow = self.dbConn.execute(
                '''
                    SELECT maxNumImages_inference
                    FROM aide_admin.project
                    WHERE shortname = %s;''', (project, ), 1)
            maxNumImages = projectRow[0]['maxnumimages_inference']

        # retrieve the IDs of the images that are to be predicted on
        inferenceQuery = self.sqlBuilder.getInferenceQueryString(
            project, forceUnlabeled, goldenQuestionsOnly, maxNumImages)
        rows = self.dbConn.execute(inferenceQuery, (maxNumImages, ), 'all')
        ids = [row['image'] for row in rows]

        return (array_split(ids, max(1, len(ids) // numChunks))
                if numChunks > 1 else [ids])
예제 #2
0
    def get_inference_images(self,
                             project,
                             epoch=None,
                             numEpochs=None,
                             goldenQuestionsOnly=False,
                             forceUnlabeled=False,
                             maxNumImages=None,
                             numChunks=1):
        '''
            Retrieves the IDs of the images to perform inference on.

            A "maxNumImages" of None or a non-positive value is replaced with
            the "maxNumImages_inference" entry of the project's row in
            "aide_admin.project". The query string comes from
            "self.sqlBuilder.getInferenceQueryString", called with the
            project, "forceUnlabeled" and the limit.

            Returns a list of image ID lists: one list with all IDs if
            "numChunks" is not greater than one; else the output of
            "array_split" for the IDs and max(1, number of IDs // numChunks).

            "epoch" and "numEpochs" are accepted, but not used in here.
            #TODO: goldenQuestionsOnly (currently not forwarded to the query builder)
        '''
        if maxNumImages is None or maxNumImages <= 0:
            # no usable limit supplied: take the one stored for the project
            limitRows = self.dbConn.execute(
                '''
                    SELECT maxNumImages_inference
                    FROM aide_admin.project
                    WHERE shortname = %s;''', (project, ), 1)
            maxNumImages = limitRows[0]['maxnumimages_inference']

        # gather the IDs of the images that are being subjected to inference
        queryString = self.sqlBuilder.getInferenceQueryString(
            project, forceUnlabeled, maxNumImages)
        selected = []
        for record in self.dbConn.execute(queryString, (maxNumImages, ),
                                          'all'):
            selected.append(record['image'])

        if numChunks > 1:
            chunkArg = max(1, len(selected) // numChunks)
            return array_split(selected, chunkArg)
        return [selected]
    def _get_inference_job_signature(self, imageIDs, maxNumWorkers=-1):
        '''
            Assembles (but does not submit) an inference job based on the provided parameters.

            Inputs:
            - imageIDs:         list of image IDs to run inference on
            - maxNumWorkers:    upper bound for the number of workers to
                                distribute the images to. A value of -1 means
                                "as many as are currently available". Resolved
                                values below one are treated as one.

            Returns a Celery group that contains one immutable
            "call_inference" signature per image subset.
        '''
        # setup
        if maxNumWorkers != 1:
            # only query the number of available workers if more than one is specified to save time
            num_available = self._get_num_available_workers()
            if maxNumWorkers == -1:
                maxNumWorkers = num_available  #TODO: more than one process per worker?
            else:
                maxNumWorkers = min(maxNumWorkers, num_available)

        # A worker count of zero (none available, or zero requested) would
        # raise a ZeroDivisionError below; fall back to a single job instead.
        maxNumWorkers = max(1, maxNumWorkers)

        # distribute across workers
        images_subset = array_split(imageIDs,
                                    max(1,
                                        len(imageIDs) // maxNumWorkers))
        jobs = [
            celery_interface.call_inference.si(imageIDs=subset)
            for subset in images_subset
        ]
        return group(jobs)
예제 #4
0
def _average_if_array(value):
    '''
        Reduces a list or NumPy array to its NaN-ignoring mean (None if the
        mean cannot be computed). Any other value is returned unchanged.
    '''
    if isinstance(value, (list, np.ndarray)):
        # segmentation masks, etc.: array of values provided; take average instead
        try:
            return np.nanmean(np.array(value))
        except Exception:
            return None
    return value


def _call_inference(project, imageIDs, epoch, numEpochs, modelInstance,
                    modelLibrary, rankerInstance, dbConnector, batchSizeLimit):
    '''
        Runs inference on the images with the given IDs and stores the
        results in the project's "prediction" table (and feature vectors, if
        returned under key "fVec", in the project's "image" table).

        Inputs:
        - project:          project shortname
        - imageIDs:         list of image IDs to predict on
        - epoch, numEpochs: forwarded to the status message function and used
                            in log/error messages
        - modelInstance:    object whose "inference" function is called with
                            keyword arguments "stateDict", "data" and
                            "updateStateFun"
        - modelLibrary:     forwarded to the model state loading routine
        - rankerInstance:   optional; if it is not None and has a "rank"
                            function, that is called on the inference result
        - dbConnector:      database connector; its "execute" and "insert"
                            functions are used
        - batchSizeLimit:   if a positive int, the image IDs are split with
                            "array_split" and processed chunk by chunk; any
                            other value (including None) processes all images
                            in one go

        Returns None. Raises an Exception naming the failed stage (and
        chained to the original error) if any stage fails.
    '''
    print(
        f'[{project}] Epoch {epoch}: Initiated inference on {len(imageIDs)} images...'
    )
    update_state = __get_message_fun(project, len(imageIDs), 0, epoch,
                                     numEpochs)

    # get project's prediction type
    projectMeta = dbConnector.execute(
        sql.SQL('''
            SELECT predictionType
            FROM aide_admin.project
            WHERE shortname = %s;
        '''), (project, ), 1)
    predType = projectMeta[0]['predictiontype']

    # load model state
    update_state(state='PREPARING',
                 message=f'[Epoch {epoch}] loading model state')
    try:
        stateDict, stateDictID, modelOriginID, _ = __load_model_state(
            project, modelLibrary, dbConnector)
    except Exception as e:
        print(e)
        raise Exception(
            f'[Epoch {epoch}] error during model state loading (reason: {str(e)})'
        ) from e

    # if batch size limit specified: split imageIDs into chunks and process in smaller batches
    useChunks = isinstance(batchSizeLimit, int) and batchSizeLimit > 0
    if useChunks:
        imageID_chunks = array_split(imageIDs, batchSizeLimit)
    else:
        imageID_chunks = [imageIDs]

    # process in batches
    for idx, imageID_batch in enumerate(imageID_chunks):
        chunkStr = f'{idx+1}/{len(imageID_chunks)}'
        print(f'Chunk {chunkStr}')

        # Only multiply by the batch size limit if it actually is a usable
        # number; with e.g. None this used to raise a TypeError.
        offset = idx * batchSizeLimit if useChunks else 0
        update_state = __get_message_fun(project, len(imageIDs), offset,
                                         epoch, numEpochs)

        # load remaining data (image filenames, class definitions)
        update_state(
            state='PREPARING',
            message=f'[Epoch {epoch}] loading metadata (chunk {chunkStr})')
        try:
            data = __load_metadata(project, dbConnector, imageID_batch, False,
                                   modelOriginID)
        except Exception as e:
            print(e)
            raise Exception(
                f'[Epoch {epoch}] error during metadata loading (chunk {chunkStr})'
            ) from e

        # call inference function
        update_state(
            state='PREPARING',
            message=f'[Epoch {epoch}] starting inference (chunk {chunkStr})')
        try:
            result = modelInstance.inference(stateDict=stateDict,
                                             data=data,
                                             updateStateFun=update_state)
        except Exception as e:
            print(e)
            raise Exception(
                f'[Epoch {epoch}] error during inference (chunk {chunkStr}; reason: {str(e)})'
            ) from e

        # call ranking function (AL criterion)
        if rankerInstance is not None and hasattr(rankerInstance, 'rank'):
            update_state(
                state='PREPARING',
                message=
                f'[Epoch {epoch}] calculating priorities (chunk {chunkStr})')
            try:
                result = rankerInstance.rank(data=result,
                                             updateStateFun=update_state,
                                             stateDict=stateDict)
            except Exception as e:
                print(e)
                raise Exception(
                    f'[Epoch {epoch}] error during ranking (chunk {chunkStr}, reason: {str(e)})'
                ) from e

        # parse result
        try:
            update_state(
                state='FINALIZING',
                message=f'[Epoch {epoch}] saving predictions (chunk {chunkStr})'
            )
            fieldNames = list(getattr(FieldNames_prediction, predType).value)
            fieldNames.append('image')  # image ID
            fieldNames.append('cnnstate')  # model state ID
            values_pred = []
            values_img = []  # mostly for feature vectors
            for imgID in result.keys():
                for prediction in result[imgID]['predictions']:

                    # if segmentation mask: encode
                    if predType == 'segmentationMasks':
                        # NOTE(review): always reads the first prediction's
                        # label, not the current one; presumably there is
                        # only one mask per image -- TODO confirm
                        segMask = np.array(
                            result[imgID]['predictions'][0]['label']).astype(
                                np.uint8)
                        height, width = segMask.shape
                        segMask = base64.b64encode(
                            segMask.ravel()).decode('utf-8')
                        segMaskDimensions = {'width': width, 'height': height}

                    nextResultValues = []
                    # we expect a dict of values, so we can use the fieldNames directly
                    for fn in fieldNames:
                        if fn == 'image':
                            nextResultValues.append(imgID)
                        elif fn == 'cnnstate':
                            nextResultValues.append(stateDictID)
                        elif fn == 'segmentationmask':
                            nextResultValues.append(segMask)
                        elif fn == 'width' or fn == 'height':
                            if predType == 'segmentationMasks':
                                nextResultValues.append(segMaskDimensions[fn])
                            elif fn in prediction:
                                nextResultValues.append(prediction[fn])
                            else:
                                nextResultValues.append(None)
                        elif fn == 'priority':
                            if fn in prediction and prediction[
                                    fn] is None and 'confidence' in prediction:
                                # ranker somehow didn't assign value; use confidence by default
                                value = prediction['confidence']
                            elif fn in prediction:
                                value = prediction[fn]
                            else:
                                #TODO: provide replacement for priority, e.g. in case of segmentation masks
                                value = None
                            nextResultValues.append(_average_if_array(value))
                        elif fn == 'confidence':
                            value = None
                            if fn in prediction:
                                value = _average_if_array(prediction[fn])
                            nextResultValues.append(value)
                        else:
                            if fn in prediction:
                                #TODO: might need to do typecasts (e.g. UUID?)
                                nextResultValues.append(prediction[fn])
                            else:
                                # field name is not in return value; might need to raise a warning, Exception, or set to None
                                nextResultValues.append(None)

                    values_pred.append(tuple(nextResultValues))

                if 'fVec' in result[imgID] and len(result[imgID]['fVec']):
                    values_img.append((
                        imgID,
                        psycopg2.Binary(result[imgID]['fVec']),
                    ))
        except Exception as e:
            print(e)
            raise Exception(
                f'[Epoch {epoch}] error during result parsing (chunk {chunkStr}, reason: {str(e)})'
            ) from e

        # commit to database
        try:
            if len(values_pred):
                queryStr = sql.SQL('''
                    INSERT INTO {id_pred} ( {fieldNames} )
                    VALUES %s;
                ''').format(id_pred=sql.Identifier(project, 'prediction'),
                            fieldNames=sql.SQL(',').join(
                                [sql.SQL(f) for f in fieldNames]))
                dbConnector.insert(queryStr, values_pred)

            if len(values_img):
                queryStr = sql.SQL('''
                    INSERT INTO {} ( id, fVec )
                    VALUES %s
                    ON CONFLICT (id) DO UPDATE SET fVec = EXCLUDED.fVec;
                ''').format(sql.Identifier(project, 'image'))
                dbConnector.insert(queryStr, values_img)
        except Exception as e:
            print(e)
            raise Exception(
                f'[Epoch {epoch}] error during data committing (chunk {chunkStr}, reason: {str(e)})'
            ) from e

    update_state(state=states.SUCCESS,
                 message='predicted on {} images'.format(len(imageIDs)))

    print(f'[{project}] Epoch {epoch}: Inference completed successfully.')
    return
    def _get_training_job_signature(self,
                                    minTimestamp='lastState',
                                    minNumAnnoPerImage=0,
                                    maxNumImages=None,
                                    maxNumWorkers=-1):
        '''
            Assembles (but does not submit) a training job based on the provided parameters.
        '''
        # check if training is still in progress
        if self.messageProcessor.task_ongoing('train'):
            raise Exception('Training process already running.')

        self.training = True

        try:

            # sanity checks
            if not (isinstance(minTimestamp, datetime)
                    or minTimestamp == 'lastState' or minTimestamp == -1
                    or minTimestamp is None):
                raise ValueError(
                    '{} is not a recognized property for variable "minTimestamp"'
                    .format(str(minTimestamp)))

            if maxNumWorkers != 1:
                # only query the number of available workers if more than one is specified to save time
                num_workers = min(maxNumWorkers,
                                  self._get_num_available_workers())
            else:
                num_workers = maxNumWorkers

            # query image IDs
            sql = self.sqlBuilder.getLatestQueryString(
                minNumAnnoPerImage=minNumAnnoPerImage, limit=maxNumImages)

            if isinstance(minTimestamp, datetime):
                imageIDs = self.dbConn.execute(sql, (minTimestamp, ), 'all')
            else:
                imageIDs = self.dbConn.execute(sql, None, 'all')

            imageIDs = [i['image'] for i in imageIDs]

            if maxNumWorkers > 1:

                # distribute across workers (TODO: also specify subset size for multiple jobs; randomly draw if needed)
                images_subset = array_split(
                    imageIDs, max(1,
                                  len(imageIDs) // num_workers))

                processes = []
                for subset in images_subset:
                    processes.append(
                        celery_interface.call_train.si(subset, True))
                process = group(processes)

            else:
                # call one worker directly
                # process = celery_interface.call_train.delay(data) #TODO: route to specific worker? http://docs.celeryproject.org/en/latest/userguide/routing.html#manual-routing
                process = celery_interface.call_train.si(imageIDs, False)

            return process, num_workers

        except:
            self.training = self.messageProcessor.task_ongoing('train')
            return None
예제 #6
0
    def get_training_images(self,
                            project,
                            epoch=None,
                            numEpochs=None,
                            minTimestamp='lastState',
                            includeGoldenQuestions=True,
                            minNumAnnoPerImage=0,
                            maxNumImages=None,
                            numChunks=1):
        '''
            Queries the database for the latest images to be used for model training.
            Returns a list with image UUIDs accordingly, split into the number of
            available workers.

            Inputs:
            - project:              project shortname (used as schema name)
            - minTimestamp:         None (no time restriction), 'lastState'
                                    (only images viewed after the latest
                                    entry in the project's "cnnstate" table),
                                    a datetime, or -1. Anything else raises a
                                    ValueError.
            - minNumAnnoPerImage:   if greater than zero, only images with at
                                    least this many rows in the project's
                                    "annotation" table are considered
            - maxNumImages:         if None or not positive, no LIMIT is
                                    applied
            - numChunks:            if greater than one, the IDs are split
                                    with "array_split", using
                                    max(1, number of IDs // numChunks) as
                                    second argument

            "epoch", "numEpochs" and "includeGoldenQuestions" are accepted,
            but not used in here.
            #TODO: includeGoldenQuestions

            Returns a list of image ID lists.
        '''
        # sanity checks
        if not (isinstance(minTimestamp, datetime) or minTimestamp
                == 'lastState' or minTimestamp == -1 or minTimestamp is None):
            raise ValueError(
                '{} is not a recognized property for variable "minTimestamp"'.
                format(str(minTimestamp)))

        # query image IDs
        # (values are appended in the order the placeholders appear in the
        # final query: timestamp, annotation count, limit)
        queryVals = []

        # COALESCE returns its first non-NULL argument, so the fallback
        # "to_timestamp(0)" has to come last; with it in first position the
        # timestamp restriction would never take effect.
        if minTimestamp is None:
            timestampStr = sql.SQL('')
        elif minTimestamp == 'lastState':
            timestampStr = sql.SQL('''
            WHERE iu.last_checked > COALESCE(
            (SELECT MAX(timecreated) FROM {id_cnnstate}),
            to_timestamp(0))''').format(
                id_cnnstate=sql.Identifier(project, 'cnnstate'))
        elif isinstance(minTimestamp, datetime):
            timestampStr = sql.SQL(
                'WHERE iu.last_checked > COALESCE(%s, to_timestamp(0))')
            queryVals.append(minTimestamp)
        else:
            # numeric value (seconds since epoch); per the sanity check
            # above, this is only reached for values equal to -1
            timestampStr = sql.SQL(
                'WHERE iu.last_checked > COALESCE(to_timestamp(%s), to_timestamp(0))'
            )
            queryVals.append(minTimestamp)

        if minNumAnnoPerImage > 0:
            # restrict to images with a minimum number of annotations; this
            # is the first condition if there is no timestamp restriction
            annoFilterStr = sql.SQL('''
                    {conjunction} image IN (
                        SELECT image FROM (
                            SELECT image, COUNT(*) AS cnt
                            FROM {id_anno}
                            GROUP BY image
                            ) AS annoCount
                        WHERE annoCount.cnt >= %s
                    )''').format(
                conjunction=(sql.SQL('WHERE')
                             if minTimestamp is None else sql.SQL('AND')),
                id_anno=sql.Identifier(project, 'annotation'))
            queryVals.append(minNumAnnoPerImage)
        else:
            annoFilterStr = sql.SQL('')

        if maxNumImages is None or maxNumImages <= 0:
            limitStr = sql.SQL('')
        else:
            limitStr = sql.SQL('LIMIT %s')
            queryVals.append(maxNumImages)

        queryStr = sql.SQL('''
            SELECT newestAnno.image FROM (
                SELECT image, last_checked FROM {id_iu} AS iu
                JOIN (
                    SELECT id AS iid
                    FROM {id_img}
                    WHERE corrupt IS NULL OR corrupt = FALSE
                ) AS imgQ
                ON iu.image = imgQ.iid
                {timestampStr}
                {annoFilterStr}
                ORDER BY iu.last_checked ASC
                {limitStr}
            ) AS newestAnno;
        ''').format(id_iu=sql.Identifier(project, 'image_user'),
                    id_img=sql.Identifier(project, 'image'),
                    timestampStr=timestampStr,
                    annoFilterStr=annoFilterStr,
                    limitStr=limitStr)

        imageIDs = self.dbConn.execute(queryStr, tuple(queryVals), 'all')
        imageIDs = [i['image'] for i in imageIDs]

        if numChunks > 1:
            # split for distribution across workers (TODO: also specify subset size for multiple jobs; randomly draw if needed)
            imageIDs = array_split(imageIDs, max(1,
                                                 len(imageIDs) // numChunks))
        else:
            imageIDs = [imageIDs]

        # guard against an empty list of chunks when reporting
        print("Assembled training images into {} chunks (length of first: {})".
              format(len(imageIDs),
                     len(imageIDs[0]) if len(imageIDs) else 0))
        return imageIDs