class EmotionDetector:
    def __init__(self,
                 model='VGG19',
                 main_dir=main_dir_path,
                 face_detector=None,
                 use_cuda=False,
                 reliability=0.8):
        self.main_dir = main_dir
        self.face_detector = face_detector
        self.use_cuda = use_cuda
        self.reliability = reliability
        self.cut_size = 44

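        # test-time augmentation: TenCrop takes the four 44x44 corner crops,
        # the centre crop, and their horizontal flips from each 48x48 face,
        # so every face is scored as a batch of ten crops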
        self.transform_test = transforms.Compose([
            transforms.TenCrop(self.cut_size),
            transforms.Lambda(lambda crops: torch.stack(
                [transforms.ToTensor()(crop) for crop in crops])),
        ])

        self.class_names = [
            'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'
        ]

        if model == 'VGG19':
            self.net = VGG('VGG19')
        elif model == 'Resnet18':
            self.net = ResNet18()
        else:
            raise ValueError('Unsupported model: {}'.format(model))
        self.checkpoint = torch.load(os.path.join(self.main_dir,
                                                  'pretrained_model', model,
                                                  'PrivateTest_model.t7'),
                                     map_location='cpu')
        self.net.load_state_dict(self.checkpoint['net'])
        if self.use_cuda:
            self.net.cuda()
        self.net.eval()

    def rgb2gray(self, rgb):
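        # luminance conversion with the ITU-R BT.601 weights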
        return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])

    def detect_emotion_single_face(self, raw_img):
        '''
            Detect the facial emotion in an image containing a single face.
        '''
        gray = self.rgb2gray(raw_img)
        gray = resize(gray, (48, 48), mode='symmetric').astype(np.uint8)

        img = gray[:, :, np.newaxis]
        img = np.concatenate((img, img, img), axis=2)
        img = Image.fromarray(img)
        inputs = self.transform_test(img)

        ncrops, c, h, w = np.shape(inputs)
        inputs = inputs.view(-1, c, h, w)
        if self.use_cuda:
            inputs = inputs.cuda()
        # volatile=True was removed in PyTorch 0.4; run inference under no_grad
        with torch.no_grad():
            outputs = self.net(inputs)

        outputs_avg = outputs.view(ncrops, -1).mean(0)  # avg over crops

        score = F.softmax(outputs_avg, dim=0)
        _, predicted = torch.max(outputs_avg.data, 0)
        if torch.max(score) > self.reliability:
            return score, self.class_names[int(predicted.cpu().numpy())]
        else:
            return score, 'UNK'

    def detect_emotion_multiple_face(self, raw_img):
        '''
            Detect facial emotions in an image containing multiple faces.
        '''

        if isinstance(self.face_detector, MTCNN):
            bounding_boxes, _, _ = self.face_detector.align(raw_img)
        else:
            # TODO: add more face-detection models to experiment with
            print('No MTCNN face detector found.')
            return [], [], []

        scores = []
        predicteds = []
        for facebox in bounding_boxes:
            face_img = raw_img[int(facebox[1]):int(facebox[3]),
                               int(facebox[0]):int(facebox[2])]

            gray = self.rgb2gray(face_img)
            gray = resize(gray, (48, 48), mode='symmetric').astype(np.uint8)

            img = gray[:, :, np.newaxis]
            img = np.concatenate((img, img, img), axis=2)
            img = Image.fromarray(img)
            inputs = self.transform_test(img)

            ncrops, c, h, w = np.shape(inputs)
            inputs = inputs.view(-1, c, h, w)
            if self.use_cuda:
                inputs = inputs.cuda()
            # volatile=True was removed in PyTorch 0.4; run inference under no_grad
            with torch.no_grad():
                outputs = self.net(inputs)

            outputs_avg = outputs.view(ncrops, -1).mean(0)  # avg over crops

            score = F.softmax(outputs_avg, dim=0)
            _, predicted = torch.max(outputs_avg.data, 0)

            scores.append(score)
            if torch.max(score) > self.reliability:
                predicteds.append(self.class_names[int(
                    predicted.cpu().numpy())])
            else:
                predicteds.append('UNK')

        return bounding_boxes, scores, predicteds

    def detect_emotion_from_faceboxes(self, faceboxes):
        '''
            Detect facial emotions for a list of pre-cropped face images.
        '''
        scores = []
        predicteds = []
        for face_img in faceboxes:
            gray = self.rgb2gray(face_img)
            gray = resize(gray, (48, 48), mode='symmetric').astype(np.uint8)

            img = gray[:, :, np.newaxis]
            img = np.concatenate((img, img, img), axis=2)
            img = Image.fromarray(img)
            inputs = self.transform_test(img)

            ncrops, c, h, w = np.shape(inputs)
            inputs = inputs.view(-1, c, h, w)
            if self.use_cuda:
                inputs = inputs.cuda()
            # volatile=True was removed in PyTorch 0.4; run inference under no_grad
            with torch.no_grad():
                outputs = self.net(inputs)

            outputs_avg = outputs.view(ncrops, -1).mean(0)  # avg over crops

            score = F.softmax(outputs_avg, dim=0)
            _, predicted = torch.max(outputs_avg.data, 0)

            scores.append(score)
            if torch.max(score) > self.reliability:
                predicteds.append(self.class_names[int(
                    predicted.cpu().numpy())])
            else:
                predicteds.append('UNK')

        return scores, predicteds
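
# A minimal usage sketch for EmotionDetector (commented out so the module
# stays importable). The paths, the MTCNN instance and the test image are
# assumptions for illustration; the checkpoint is expected under
# <main_dir>/pretrained_model/<model>/PrivateTest_model.t7.
#
#   from skimage import io
#   detector = EmotionDetector(model='VGG19',
#                              main_dir='./',          # hypothetical path
#                              face_detector=MTCNN(),  # hypothetical detector
#                              use_cuda=False)
#   img = io.imread('face.jpg')                        # hypothetical image
#   score, label = detector.detect_emotion_single_face(img)
#   print(label, float(score.max()))

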
class CNNDetector(object):
    def __init__(self,
                 net_12_param_path=None,
                 net_48_param_path=None,
                 net_vgg_param_path=None,
                 use_cuda=True,
                 pthreshold=0.7,
                 rthreshold=0.9):
        if not use_cuda:
            self.device = torch.device('cpu')
        else:
            self.device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu')

        # default to None so detect_face can check which nets are loaded
        self.net_12 = None
        self.net_48 = None
        self.net_vgg = None
        if net_12_param_path is not None:
            self.net_12 = Net12()
            self.net_12.load_state_dict(
                torch.load(net_12_param_path,
                           map_location=lambda storage, loc: storage))
            self.net_12.to(self.device)
            self.net_12.eval()
        if net_48_param_path is not None:
            self.net_48 = Net48()
            self.net_48.load_state_dict(
                torch.load(net_48_param_path,
                           map_location=lambda storage, loc: storage))
            self.net_48.to(self.device)
            self.net_48.eval()
        if net_vgg_param_path is not None:
            self.net_vgg = VGG('VGG19')
            self.net_vgg.load_state_dict(
                torch.load(net_vgg_param_path,
                           map_location=lambda storage, loc: storage))
            self.net_vgg.to(self.device)
            self.net_vgg.eval()
        self.pthreshold = pthreshold
        self.rthreshold = rthreshold

    def generate_stage(self, img):
        """
        Args:
            img: source image
        Rets:
            bounding boxes, numpy array, n x 5

        Generate face bounding box proposals using net-12.
        """
        proposals = list()
        downscaling_factor = 0.7
        current_height, current_width, _ = img.shape
        current_scale = 1.0
        # limit maximum height to 500
        if current_height > 500:
            current_scale = 500.0 / current_height

        receptive_field = 12
        stride = 2
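        # net-12 is fully convolutional: each output cell scores a 12x12
        # window slid with stride 2 over the resized image, so output index
        # (y, x) maps back to pixel (stride*y, stride*x) / current_scale in
        # the original image (see the coordinate recovery below)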
        while True:
            # get the resized image at current scale
            im_resized = imageproc.resize_image(img, current_scale)
            current_height, current_width, _ = im_resized.shape
            # stop once the image is smaller than net-12's receptive field
            if min(current_height, current_width) <= receptive_field:
                break
            # transpose hwc (Numpy) to chw (Tensor)
            feed_imgs = (
                transforms.ToTensor()(im_resized)).unsqueeze(0).float()
            # feed to net-12
            with torch.no_grad():
                feed_imgs = feed_imgs.to(self.device)
                bbox_class, bbox_regress = self.net_12(feed_imgs)

                bbox_class = bbox_class.cpu().squeeze(0).detach().numpy()
                bbox_regress = bbox_regress.cpu().squeeze(0).detach().numpy()

            # keep locations whose face score exceeds the proposal threshold
            up_thresh_masked_index = np.where(bbox_class > self.pthreshold)
            # drop the channel axis, keeping the (row, col) map indices
            up_thresh_masked_index = up_thresh_masked_index[1:3]
            filtered_results = np.vstack([
                # pixel coordinate for receptive window
                np.round((stride * up_thresh_masked_index[1]) / current_scale),
                np.round((stride * up_thresh_masked_index[0]) / current_scale),
                np.round(
                    (stride * up_thresh_masked_index[1] + receptive_field) /
                    current_scale),
                np.round(
                    (stride * up_thresh_masked_index[0] + receptive_field) /
                    current_scale),
                # original bbox output form network
                bbox_class[0, up_thresh_masked_index[0],
                           up_thresh_masked_index[1]],
                bbox_regress[:, up_thresh_masked_index[0],
                             up_thresh_masked_index[1]],
            ]).T
            keep_mask = imageproc.neighbour_supression(filtered_results[:, :5],
                                                       0.7, 'Union')
            filtered_results = filtered_results[keep_mask]
            current_scale *= downscaling_factor
            proposals.append(filtered_results)
        # aggregate proposals from all pyramid scales
        if len(proposals) == 0:
            # no proposal generated at any scale
            return None
        proposals = np.vstack(proposals)
        keep_mask = imageproc.neighbour_supression(proposals[:, 0:5], 0.5,
                                                   'Union')
        proposals = proposals[keep_mask]
        if len(proposals) == 0:
            # no proposal survived non-maximum suppression
            return None
        # convert multi-scale bboxes back to a unified bbox at the original img scale
        receptive_window_width_pixels = proposals[:, 2] - proposals[:, 0] + 1
        receptive_window_height_pixels = proposals[:, 3] - proposals[:, 1] + 1
        bbox_aligned = np.vstack([
            proposals[:, 0] + proposals[:, 5] *
            receptive_window_width_pixels,  # upleft_x
            proposals[:, 1] + proposals[:, 6] * \
            receptive_window_height_pixels,  # upleft_y
            proposals[:, 2] + proposals[:, 7] * \
            receptive_window_width_pixels,  # downright_x
            proposals[:, 3] + proposals[:, 8] * \
            receptive_window_height_pixels,  # downright_y
            proposals[:, 4],  # classes
        ])
        bbox_aligned = bbox_aligned.T

        return bbox_aligned

    def refine_stage(self, img, proposal_bbox):
        """
        Args:
            img: source image
            proposal_bbox: bounding box proposals from generate stage 
        Rets:
            bounding boxes, numpy array, n x 5

        Apply delta corrdinate to bboxes using net-48.
        """
        if proposal_bbox is None:
            return None, None

        proposal_bbox = imageproc.convert_to_square(proposal_bbox)

        cropped_tmp_tensors = imageproc.bbox_crop(img, proposal_bbox)
        # feed to net-48
        with torch.no_grad():
            feed_imgs = torch.stack(cropped_tmp_tensors)

            feed_imgs = feed_imgs.to(self.device)

            bbox_class, bbox_regress, landmark = self.net_48(feed_imgs)

            bbox_class = bbox_class.cpu().detach().numpy()
            bbox_regress = bbox_regress.cpu().detach().numpy()
            landmark = landmark.cpu().detach().numpy()
        # threshold
        up_thresh_masked_index = np.where(bbox_class > self.rthreshold)[0]
        boxes = proposal_bbox[up_thresh_masked_index]
        bbox_class = bbox_class[up_thresh_masked_index]
        bbox_regress = bbox_regress[up_thresh_masked_index]
        landmark = landmark[up_thresh_masked_index]
        # aggregate
        keep_mask = imageproc.neighbour_supression(boxes, 0.5, mode="Minimum")

        if len(keep_mask) == 0:
            return None, None

        proposals = boxes[keep_mask]
        bbox_class = bbox_class[keep_mask]
        bbox_regress = bbox_regress[keep_mask]
        landmark = landmark[keep_mask]

        receptive_window_width_pixels = proposals[:, 2] - proposals[:, 0] + 1
        receptive_window_height_pixels = proposals[:, 3] - proposals[:, 1] + 1
        # get new bounding boxes
        boxes_align = np.vstack([
            proposals[:, 0] +
            bbox_regress[:, 0] * receptive_window_width_pixels,  # upleft_x
            proposals[:, 1] +
            bbox_regress[:, 1] * receptive_window_height_pixels,  # upleft_y
            proposals[:, 2] +
            bbox_regress[:, 2] * receptive_window_width_pixels,  # downright_x
            proposals[:, 3] +
            bbox_regress[:, 3] * receptive_window_height_pixels,  # downright_y
            bbox_class[:, 0],
        ]).T
        # get facial landmarks
        align_landmark_topx = proposals[:, 0]
        align_landmark_topy = proposals[:, 1]
        landmark_align = np.vstack([
            align_landmark_topx +
            landmark[:, 0] * receptive_window_width_pixels,  # lefteye_x
            align_landmark_topy +
            landmark[:, 1] * receptive_window_height_pixels,  # lefteye_y
            align_landmark_topx +
            landmark[:, 2] * receptive_window_width_pixels,  # righteye_x
            align_landmark_topy +
            landmark[:, 3] * receptive_window_height_pixels,  # righteye_y
            align_landmark_topx +
            landmark[:, 4] * receptive_window_width_pixels,  # nose_x
            align_landmark_topy +
            landmark[:, 5] * receptive_window_height_pixels,  # nose_y
            align_landmark_topx +
            landmark[:, 6] * receptive_window_width_pixels,  # leftmouth_x
            align_landmark_topy +
            landmark[:, 7] * receptive_window_height_pixels,  # leftmouth_y
            align_landmark_topx +
            landmark[:, 8] * receptive_window_width_pixels,  # rightmouth_x
            align_landmark_topy +
            landmark[:, 9] * receptive_window_height_pixels,  # rightmouth_y
        ]).T

        return boxes_align, landmark_align

    def detect_face(self, img, atleastone=True):
        """
        Args:
            img: source image
            atleastone: whether the size of image should be retured when no face is found
        Rets:
            bounding boxes, numpy array
            landmark, numpy array

        Detect faces in the image. 
        """
        boxes_align = None
        landmark_align = None
        if self.net_12 is not None:
            boxes_align = self.generate_stage(img)
        if self.net_48 is not None:
            boxes_align, landmark_align = self.refine_stage(img, boxes_align)
        if boxes_align is None:
            if atleastone:
                boxes_align = np.array([[0, 0, img.shape[1], img.shape[0]]])
            else:
                boxes_align = np.array([])
        if landmark_align is None:
            landmark_align = np.array([])
        return boxes_align, landmark_align

    def crop_faces(self, img, bbox=None):
        """
        see imageproc.bbox_crop
        """
        return imageproc.bbox_crop(img, bbox, totensor=False)

    def vgg_net(self, img):
        """
        Args:
            img: source image
        Rets:
            prob of each expression: in order of 
            ['Angry', 'Disgust', 'Fear',
                       'Happy', 'Sad', 'Surprise', 'Neutral'] 

        Detect facial expression in the image. 
        """

        grey_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        grey_img = cv2.resize(grey_img, (48, 48)).astype(np.uint8)

        grey_img = grey_img[:, :, np.newaxis]
        grey_img = np.concatenate((grey_img, grey_img, grey_img), axis=2)
        receptive_field = 44
        # get ten crops at the corners and center
        tencrops = transforms.Compose([
            transforms.ToPILImage(),
            transforms.TenCrop(receptive_field),
            transforms.Lambda(lambda crops: torch.stack(
                [transforms.ToTensor()(crop) for crop in crops])),
        ])

        inputs = tencrops(grey_img)

        ncrops, c, h, w = np.shape(inputs)
        # feed to VGG net
        with torch.no_grad():
            inputs = inputs.view(-1, c, h, w)
            inputs = inputs.to(self.device)
            outputs = self.net_vgg(inputs)
            # get mean value across all the crops
            outputs_avg = outputs.view(ncrops, -1).mean(0)
            probabilities = F.softmax(outputs_avg, dim=0)
            # max prob as the detection result
            _, predicted_class = torch.max(outputs_avg.data, 0)
            probabilities = probabilities.cpu().numpy()
            predicted_class = int(predicted_class.cpu().numpy())
        return probabilities, predicted_class
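
# A minimal usage sketch for CNNDetector, behind a main guard. The weight
# paths and the test image are assumptions for illustration; detect_face
# needs both net-12 and net-48 weights, vgg_net expects a BGR face crop as
# read by cv2, and crop_faces is assumed to return plain image arrays
# (imageproc.bbox_crop with totensor=False).
if __name__ == '__main__':
    detector = CNNDetector(net_12_param_path='weights/net_12.pth',  # hypothetical path
                           net_48_param_path='weights/net_48.pth',  # hypothetical path
                           net_vgg_param_path='weights/vgg19.pth',  # hypothetical path
                           use_cuda=False)
    img = cv2.imread('test.jpg')  # hypothetical image
    boxes, landmarks = detector.detect_face(img)
    for face in detector.crop_faces(img, boxes):
        probs, cls = detector.vgg_net(face)
        print(cls, probs)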
Example #3
class DPP(object):
    def __init__(self, args):
        self.criterion = nn.CrossEntropyLoss().cuda()
        self.lr = args.lr
        self.epochs = args.epochs
        self.save_dir = './' + args.save_dir  # TODO: make configurable
        if not os.path.exists(self.save_dir):
            os.mkdir(self.save_dir)

        if args.model == 'vgg16':
            self.model = VGG('VGG16', 0)
            self.optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                                    self.model.parameters()),
                                             lr=self.lr,
                                             momentum=args.momentum,
                                             weight_decay=args.weight_decay)
            self.model = torch.nn.DataParallel(self.model)
            self.model.cuda()
        elif args.model == 'dpp_vgg16':
            self.model = integrated_kernel(args)
            self.optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                                    self.model.parameters()),
                                             lr=self.lr,
                                             momentum=args.momentum,
                                             weight_decay=args.weight_decay)

        num_params = sum(p.numel() for p in self.model.parameters()
                         if p.requires_grad)
        print('The number of parameters of the model is', num_params)

        if args.save_load:
            location = args.save_location
            print('location', location)
            checkpoint = torch.load(location)
            self.model.load_state_dict(checkpoint['state_dict'])

    def train(self, train_loader, test_loader, graph):
        self.model.train()
        best_prec = 0
        losses = AverageMeter()
        top1 = AverageMeter()
        for epoch in range(self.epochs):
            #self.adjust_learning_rate(epoch)
            for k, (inputs, target) in enumerate(train_loader):
                # `async` is a reserved keyword since Python 3.7; use
                # non_blocking for asynchronous host-to-device copies
                target = target.cuda(non_blocking=True)
                input_var = inputs.cuda()
                target_var = target
                output = self.model(input_var)
                loss = self.criterion(output, target_var)
                # compute gradients and take an SGD step
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                #Measure accuracy and record loss
                prec1 = self.accuracy(output.data, target)[0]
                losses.update(loss.item(), inputs.size(0))
                top1.update(prec1.item(), inputs.size(0))

            graph.train_loss(losses.avg, epoch, 'train_loss')
            graph.train_acc(top1.avg, epoch, 'train_acc')
            prec = self.test(test_loader, epoch, graph)
            if prec > best_prec:
                print("Acc", prec)
                best_prec = prec
                self.save_checkpoint(
                    {
                        'best_prec1': best_prec,
                        'state_dict': self.model.state_dict(),
                    },
                    filename=os.path.join(self.save_dir,
                                          'checkpoint_{}.tar'.format(epoch)))

    def test(self, test_loader, epoch, test_graph):
        self.model.eval()
        losses = AverageMeter()
        top1 = AverageMeter()
        with torch.no_grad():
            for k, (inputs, target) in enumerate(test_loader):
                target = target.cuda()
                inputs = inputs.cuda()
                # forward pass only; no parameter update during evaluation
                output = self.model(inputs)
                loss = self.criterion(output, target)
                # measure accuracy and record loss
                prec1 = self.accuracy(output.data, target)[0]
                losses.update(loss.item(), inputs.size(0))
                top1.update(prec1.item(), inputs.size(0))
        test_graph.test_loss(losses.avg, epoch, 'test_loss')
        test_graph.test_acc(top1.avg, epoch, 'test_acc')
        return top1.avg

    def accuracy(self, output, target, topk=(1, )):
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
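
    # Worked example for accuracy(): with output logits [[0.1, 0.9], [0.8, 0.2]]
    # and target [1, 0], the top-1 predictions are classes 1 and 0, both
    # correct, so the method returns [tensor(100.)]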

    def adjust_learning_rate(self, epoch):
        # step decay: scale the initial lr by 0.1 every 90 epochs (computed
        # from the base lr so repeated calls do not compound the decay)
        lr = self.lr * (0.1**(epoch // 90))
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

    def save_checkpoint(self, state, filename='checkpoint.pth.tar'):
        torch.save(state, filename)
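

# A minimal training sketch for DPP, behind a main guard. The Namespace
# fields mirror the attributes this class reads; the data loaders and the
# graph logger are assumptions standing in for the project's own data and
# plotting utilities, and CUDA is required since the class calls .cuda()
# unconditionally.
if __name__ == '__main__':
    from argparse import Namespace

    args = Namespace(lr=0.1, epochs=90, save_dir='runs', model='vgg16',
                     momentum=0.9, weight_decay=5e-4,
                     save_load=False, save_location=None)
    trainer = DPP(args)
    # train_loader / test_loader: torch.utils.data.DataLoader instances over
    # your dataset; graph: an object exposing train_loss, train_acc,
    # test_loss and test_acc methods (see the calls in train/test above)
    # trainer.train(train_loader, test_loader, graph)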