Example #1
    def __init__(self, num_classes=5, pretrained=True):
        super().__init__(num_classes=2, sequence_length=8, contains_dropout=False)
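        # NOTE: the head is fixed to num_classes=2; the constructor's num_classes argument is not forwarded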
        self.r2plus1 = r2plus1d_18(pretrained=pretrained)
        self.r2plus1.fc = nn.Identity()
        self._set_requires_grad_for_module(self.r2plus1, requires_grad=False)

        self.sync_net = PretrainedSyncNet()
        # self._set_requires_grad_for_module(self.sync_net, requires_grad=False)

        self.relu = nn.ReLU()

        self.out = nn.Sequential(
            nn.Linear(512 + 1024, 50), nn.ReLU(), nn.Linear(50, self.num_classes)
        )
        self._init = False
Example #2
    def __init__(self, num_classes, sequence_length=8, pretrained=True):
        super().__init__(num_classes=2,
                         sequence_length=sequence_length,
                         contains_dropout=False)

        self.r2plus1 = SmallVideoNetworkPooledEmbedding(pretrained=pretrained)

        self.sync_net = PretrainedSyncNet()
        self._set_requires_grad_for_module(self.sync_net, requires_grad=False)
        # reduce the standard audio_extractor's 1024-d output to 8 values via max pooling
        self.audio_pooling = nn.Sequential(
            SqueezeModule(1, squeeze=False, dim=1),
            nn.MaxPool1d(128, 128),
            SqueezeModule(1, squeeze=True, dim=1),
        )

        self.out = nn.Sequential(nn.Linear(16, 50), nn.BatchNorm1d(50),
                                 nn.LeakyReLU(0.2), nn.Linear(50, 2))
Example #3
    def __init__(self, num_classes=5, pretrained=True):
        super().__init__(
            num_classes=num_classes, sequence_length=8, contains_dropout=False
        )
        self.r2plus1 = torch.hub.load(
            "moabitcoin/ig65m-pytorch",
            "r2plus1d_34_8_kinetics",
            num_classes=400,
            pretrained=pretrained,
        )
        self.r2plus1.layer3 = nn.Identity()
        self.r2plus1.layer4 = nn.Identity()
        self.r2plus1.fc = nn.Identity()

        self.sync_net = PretrainedSyncNet()
        self._set_requires_grad_for_module(self.sync_net, requires_grad=False)

        self.relu = nn.ReLU()
        self.out = nn.Sequential(
            nn.Linear(128 + 1024, 50), nn.ReLU(), nn.Linear(50, self.num_classes)
        )
Example #4
class NoisySyncAudioNet(BinaryEvaluationMixin, SequenceClassificationModel):
    def __init__(self, num_classes, pretrained=True):
        super().__init__(num_classes=2,
                         sequence_length=8,
                         contains_dropout=False)

        self.r2plus1 = r2plus1d_18(pretrained=pretrained)
        self.r2plus1.layer2 = nn.Identity()
        self.r2plus1.layer3 = nn.Identity()
        self.r2plus1.layer4 = nn.Identity()
        self.r2plus1.fc = nn.Identity()

        self.sync_net = PretrainedSyncNet()
        # self._set_requires_grad_for_module(self.sync_net, requires_grad=False)

        self.relu = nn.ReLU()
        self.out = nn.Sequential(nn.Linear(64 + 1024, 50), nn.ReLU(),
                                 nn.Linear(50, self.num_classes))
        self._init = False

    def forward(self, x):
        video, audio = x  # bs x 8 x 3 x 112 x 112, bs x 8 x 4 x 13

        video = video.transpose(1, 2)
        video = self.r2plus1(video)

        # syncnet only uses 5 frames
        audio = audio[:, 2:-1]
        audio = (audio.reshape(
            (audio.shape[0], -1, 13)).unsqueeze(1)).transpose(-2, -1)
        audio = self.sync_net.audio_extractor(audio)

        flat = torch.cat((video, audio), dim=1)
        out = self.out(self.relu(flat))
        return out

    def training_step(self, batch, batch_nb, system):
        x, (target, aud_noisy) = batch
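        # supervise with the audio-noise label instead of the manipulation target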
        return super().training_step((x, aud_noisy), batch_nb, system)

    def aggregate_outputs(self, outputs, system):
        if not self._init:
            self._init = True
            system.file_list.class_to_idx = {"fake": 0, "youtube": 1}
            system.file_list.classes = ["fake", "youtube"]
        for x in outputs:
            x["target"] = x["target"][1]
        return super().aggregate_outputs(outputs, system)
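
The audio handling in forward above is easiest to verify as a shape walkthrough. A minimal sketch, assuming the audio input is laid out as 8 frames x 4 MFCC windows x 13 coefficients (the layout annotated in Example #6); names and the batch size are illustrative:

import torch

bs = 4
audio = torch.randn(bs, 8, 4, 13)      # bs x frames x windows x MFCC coefficients

audio = audio[:, 2:-1]                 # keep the 5 center frames  -> bs x 5 x 4 x 13
audio = audio.reshape(bs, -1, 13)      # flatten frames and windows -> bs x 20 x 13
audio = audio.unsqueeze(1)             # add a channel dimension    -> bs x 1 x 20 x 13
audio = audio.transpose(-2, -1)        # time-major to MFCC-major   -> bs x 1 x 13 x 20

assert audio.shape == (bs, 1, 13, 20)  # the patch shape the SyncNet audio extractor consumes
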
Example #5
class SyncAudioNet(SequenceClassificationModel):
    def __init__(self, num_classes=5, pretrained=True):
        super().__init__(
            num_classes=num_classes, sequence_length=8, contains_dropout=False
        )
        self.r2plus1 = torch.hub.load(
            "moabitcoin/ig65m-pytorch",
            "r2plus1d_34_8_kinetics",
            num_classes=400,
            pretrained=pretrained,
        )
        self.r2plus1.layer3 = nn.Identity()
        self.r2plus1.layer4 = nn.Identity()
        self.r2plus1.fc = nn.Identity()

        self.sync_net = PretrainedSyncNet()
        self._set_requires_grad_for_module(self.sync_net, requires_grad=False)

        self.relu = nn.ReLU()
        self.out = nn.Sequential(
            nn.Linear(128 + 1024, 50), nn.ReLU(), nn.Linear(50, self.num_classes)
        )

    def forward(self, x):
        video, audio = x  # bs x 8 x 3 x 112 x 112, bs x 8 x 4 x 13

        video = video.transpose(1, 2)
        video = self.r2plus1(video)

        # syncnet only uses 5 frames
        audio = audio[:, 2:-1]
        audio = (audio.reshape((audio.shape[0], -1, 13)).unsqueeze(1)).transpose(-2, -1)
        audio = self.sync_net.audio_extractor(audio)

        flat = torch.cat((video, audio), dim=1)
        out = self.out(self.relu(flat))
        return out

    def training_step(self, batch, batch_nb, system):
        x, (target, _) = batch
        return super().training_step((x, target), batch_nb, system)

    def aggregate_outputs(self, outputs, system):
        for x in outputs:
            x["target"] = x["target"][0]
        return super().aggregate_outputs(outputs, system)
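
A minimal smoke test for SyncAudioNet, assuming the torch.hub checkpoint and the PretrainedSyncNet weights can be loaded; the batch size and random inputs are placeholders:

import torch

model = SyncAudioNet(num_classes=5).eval()

video = torch.randn(2, 8, 3, 112, 112)  # bs x frames x channels x H x W
audio = torch.randn(2, 8, 4, 13)        # bs x frames x MFCC windows x coefficients

with torch.no_grad():
    logits = model((video, audio))
print(logits.shape)                      # expected: torch.Size([2, 5])
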
Example #6
class SmallEmbeddingSpace(BinaryEvaluationMixin, SequenceClassificationModel):
    def __init__(self, num_classes, sequence_length=8, pretrained=True):
        super().__init__(num_classes=2,
                         sequence_length=sequence_length,
                         contains_dropout=False)

        self.r2plus1 = SmallVideoNetworkPooledEmbedding(pretrained=pretrained)

        self.sync_net = PretrainedSyncNet()
        self._set_requires_grad_for_module(self.sync_net, requires_grad=False)
        # reduce the standard audio_extractor's 1024-d output to 8 values via max pooling
        self.audio_pooling = nn.Sequential(
            SqueezeModule(1, squeeze=False, dim=1),
            nn.MaxPool1d(128, 128),
            SqueezeModule(1, squeeze=True, dim=1),
        )

        self.out = nn.Sequential(nn.Linear(16, 50), nn.BatchNorm1d(50),
                                 nn.LeakyReLU(0.2), nn.Linear(50, 2))

    def forward(self, x):
        video, audio = x  # bs x 8 x 3 x 112 x 112 , bs x 8 x 4 x 13
        # syncnet only uses 5 frames
        audio = audio[:, 2:-1]
        audio = (audio.reshape(
            (audio.shape[0], -1, 13)).unsqueeze(1)).transpose(-2, -1)

        audio = self.sync_net.audio_extractor(audio)  # bs x 1024
        audio = self.audio_pooling(audio)  # bs x 8

        video = self.r2plus1(video)  # bs x 8
        embedding = torch.cat((video, audio), dim=1)
        return self.out(embedding)

    def training_step(self, batch, batch_nb, system):
        x, target = batch
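        # target[0] // 4 binarizes the five labels: 0-3 (fake) -> 0, 4 (youtube) -> 1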
        return super().training_step((x, target[0] // 4), batch_nb, system)

    def aggregate_outputs(self, outputs, system):
        for output in outputs:
            output["target"] = output["target"][0] // 4
        return super().aggregate_outputs(outputs, system)
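
The audio_pooling stack reduces the 1024-d SyncNet embedding to 8 values so that it matches the 8-d video embedding; SqueezeModule presumably just adds and removes the channel dimension that MaxPool1d needs. A hedged equivalent with plain tensor ops (SqueezeModule's exact semantics are assumed):

import torch
import torch.nn.functional as F

audio = torch.randn(4, 1024)           # bs x 1024 SyncNet embedding
pooled = F.max_pool1d(audio.unsqueeze(1), kernel_size=128, stride=128)
pooled = pooled.squeeze(1)             # bs x 8: one max per 128-value chunk
assert pooled.shape == (4, 8)

Concatenated with the 8-d video embedding, this yields the 16-d input expected by the nn.Linear(16, 50) classifier head.
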
Example #7
    def __init__(self, num_classes=5, pretrained=True):
        super().__init__(num_classes=num_classes,
                         sequence_length=8,
                         contains_dropout=False)
        self.r2plus1 = r2plus1d_18(pretrained=pretrained)

        self.r2plus1.layer3 = nn.Identity()
        self.r2plus1.layer4 = nn.Identity()
        self.r2plus1.fc = nn.Identity()

        self.sync_net = PretrainedSyncNet()
        self._set_requires_grad_for_module(self.sync_net, requires_grad=False)

        self.relu = nn.ReLU()

        self.padding = nn.ReflectionPad2d((0, 1, 0, 0))
        self.upsample = nn.Upsample(size=(8, 56, 56))

        self.merge_conv: nn.Module = nn.Sequential(
            Conv2Plus1D(128, 64, 144, 1), nn.BatchNorm3d(64),
            nn.ReLU(inplace=True))

        self.out = nn.Sequential(nn.Linear(128, 50), nn.ReLU(),
                                 nn.Linear(50, self.num_classes))
Example #8
    def __init__(self, num_classes=5, sequence_length=8, pretrained=True):
        super().__init__(
            num_classes=num_classes,
            sequence_length=sequence_length,
            contains_dropout=False,
        )
        self.r2plus1 = r2plus1d_18(pretrained=pretrained)
        self.r2plus1.layer2 = nn.Identity()
        self.r2plus1.layer3 = nn.Identity()
        self.r2plus1.layer4 = nn.Identity()
        self.r2plus1.fc = nn.Identity()

        self.video_mlp = nn.Sequential(
            nn.Linear(64, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Linear(512, 1024)
        )

        self.sync_net = PretrainedSyncNet()
        self._set_requires_grad_for_module(self.sync_net, requires_grad=False)

        self.audio_extractor = self.sync_net.audio_extractor

        self.c_loss = ContrastiveLoss(20)

        self.log_class_loss = False
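
Example #8 only wires up the pieces: video_mlp lifts the 64-d video features into the 1024-d space of the SyncNet audio embedding, and ContrastiveLoss(20) is presumably used to pull matching video/audio pairs together in that space. The repository's ContrastiveLoss is not shown; a common margin-based formulation it may follow, sketched purely as an assumption (the pair-label convention and margin semantics included), is:

import torch
import torch.nn.functional as F

def contrastive_loss(emb_a, emb_b, same, margin=20.0):
    # same: 1 for matching (in-sync) pairs, 0 for mismatched pairs
    dist = F.pairwise_distance(emb_a, emb_b)
    # pull matching pairs together; push mismatched pairs at least `margin` apart
    loss = same * dist.pow(2) + (1 - same) * F.relu(margin - dist).pow(2)
    return loss.mean()

Example #9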
# flake8: noqa
#%%
from importlib import reload

import forgery_detection.data.file_lists
from forgery_detection.data import set

reload(set)
from forgery_detection.data.utils import resized_crop
from forgery_detection.models.audio.similarity_stuff import (
    PretrainedSimilarityNet,
    PretrainedSyncNet,
)

p = PretrainedSyncNet().eval()  # .to("cuda:2")
p._shuffle_audio = lambda x: x
# f = FileList.load("/data/ssd1/file_lists/c40/tracked_resampled_faces.json")
f = forgery_detection.data.file_lists.FileList.load(
    "/data/ssd1/file_lists/c40/tracked_resampled_faces_yt_only_112_16_sequence_length.json"
)

#%%
import torch
from forgery_detection.data import loading

reload(loading)
from torchvision import transforms

d = f.get_dataset(
    "test",
    sequence_length=5,
)
Example #10
class SyncAudioNetRegularized(SequenceClassificationModel):
    def __init__(self, num_classes=5, pretrained=True):
        super().__init__(num_classes=num_classes,
                         sequence_length=8,
                         contains_dropout=False)
        self.r2plus1 = r2plus1d_18(pretrained=pretrained)

        self.r2plus1.layer2 = nn.Identity()
        self.r2plus1.layer3 = nn.Identity()
        self.r2plus1.layer4 = nn.Identity()
        self.r2plus1.fc = nn.Identity()

        self.sync_net = PretrainedSyncNet()
        self._set_requires_grad_for_module(self.sync_net, requires_grad=False)

        self.relu = nn.ReLU()
        self.out = nn.Sequential(nn.Linear(64 + 1024, 50), nn.ReLU(),
                                 nn.Linear(50, self.num_classes))

    def forward(self, x):
        video, audio = x  # bs x 8 x 3 x 112 x 112, bs x 8 x 4 x 13

        video = video.transpose(1, 2)
        video = self.r2plus1(video)

        # syncnet only uses 5 frames
        audio = audio[:, 2:-1]
        audio = (audio.reshape(
            (audio.shape[0], -1, 13)).unsqueeze(1)).transpose(-2, -1)
        audio = self.sync_net.audio_extractor(audio)

        flat = torch.cat((video, audio), dim=1)
        out = self.out(self.relu(flat))
        return out, (video, audio)

    def weight_loss(self):
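        # spread of the classifier weights over video (first 64) vs. audio (last 1024) inputs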
        vid_weights = self.out[0].weight[:, :64].std()
        aud_weights = self.out[0].weight[:, 64:].std()
        return torch.norm(vid_weights - aud_weights, 2) * 1e3

    def training_step(self, batch, batch_nb, system):
        x, (target, _) = batch

        pred, embeddings = self.forward(x)
        classification_loss = self.loss(pred, target)
        weight_loss = self.weight_loss()
        lightning_log = {"loss": classification_loss + weight_loss}

        with torch.no_grad():
            train_acc = self.calculate_accuracy(pred, target)
            tensorboard_log = {
                "loss": {
                    "train": classification_loss + weight_loss
                },
                "classification_loss": classification_loss,
                "weight_loss": weight_loss,
                "acc": {
                    "train": train_acc
                },
                "vid_std": torch.std(self.out[0].weight[:, :64]),
                "aud_std": torch.std(self.out[0].weight[:, 64:]),
            }

        return tensorboard_log, lightning_log

    def aggregate_outputs(self, outputs, system):
        if len(system.val_dataloader()) > 1:
            outputs = outputs[0]

        with torch.no_grad():
            pred = torch.cat([x["pred"][0] for x in outputs], 0)
            target = torch.cat([x["target"][0] for x in outputs], 0)

            loss_mean_classification = self.loss(pred, target)
            pred = pred.cpu()
            target = target.cpu()
            pred = F.softmax(pred, dim=1)
            acc_mean = self.calculate_accuracy(pred, target)

            # confusion matrix
            class_accuracies = system.log_confusion_matrix(target, pred)

            weight_loss = self.weight_loss()

            tensorboard_log = {
                "loss": loss_mean_classification + weight_loss,
                "acc": acc_mean,
                "class_acc": class_accuracies,
                "classification_loss": loss_mean_classification,
                "weight_loss": weight_loss,
                "vid_std": torch.std(self.out[0].weight[:, :64]),
                "aud_std": torch.std(self.out[0].weight[:, 64:]),
            }
        # if system.global_step > 0:
        self.log_class_loss = True

        return tensorboard_log, {}
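
The weight_loss regularizer above can be exercised in isolation. A minimal sketch on a standalone linear layer with the same 64/1024 input split; the 1e3 scale is copied from the model:

import torch
import torch.nn as nn

layer = nn.Linear(64 + 1024, 50)

vid_std = layer.weight[:, :64].std()   # spread of the video-feature weights
aud_std = layer.weight[:, 64:].std()   # spread of the audio-feature weights

# penalize any gap between the two spreads so neither modality dominates
weight_loss = torch.norm(vid_std - aud_std, 2) * 1e3
print(float(weight_loss))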