def __init__(self,
             rgb_size=RGB_FEATURES_SIZE,
             audio_size=AUDIO_FEATURES_SIZE,
             output_size=YOUTUBE8M_LABELS_N,
             inner_sizes=(2048, 2048)):
    """Wrap a single FCN whose input width is ``rgb_size + audio_size``.

    Args:
        rgb_size: RGB feature width; added to ``audio_size`` to form the
            FCN input width.
        audio_size: Audio feature width; added to ``rgb_size``.
        output_size: Forwarded to FCN as its second positional argument.
        inner_sizes: Forwarded to FCN as its third positional argument
            (presumably the hidden layer widths -- confirm against FCN).
    """
    super().__init__()

    # The summed width suggests both feature vectors are fed in together;
    # how they are joined is decided in forward(), not shown here.
    input_width = rgb_size + audio_size

    self._impl = FCN(
        input_width,
        output_size,
        inner_sizes,
        dropout=0.2,
        # The activation is handed over as a class, not as an instance.
        out_activation=nn.Sigmoid,
    )
def __init__(self,
             rgb_size=RGB_FEATURES_SIZE,
             audio_size=AUDIO_FEATURES_SIZE,
             output_size=YOUTUBE8M_LABELS_N,
             inner_size=2048,
             layers_number=2):
    """Create ``layers_number`` single-layer GRUs followed by an FCN head.

    Args:
        rgb_size: RGB feature width; part of the first GRU's input width.
        audio_size: Audio feature width; part of the first GRU's input
            width.
        output_size: Forwarded to FCN as its second positional argument.
        inner_size: Hidden size of every GRU and the input width of every
            GRU after the first.
        layers_number: How many separate GRU modules to create.
    """
    super().__init__()

    recurrent_stack = []
    for depth in range(layers_number):
        # Only the first GRU is sized for the raw features; every later
        # one is sized for the hidden state of a previous GRU.
        if depth == 0:
            in_width = rgb_size + audio_size
        else:
            in_width = inner_size
        recurrent_stack.append(
            nn.GRU(
                input_size=in_width,
                hidden_size=inner_size,
                num_layers=1,
                batch_first=True,
                bidirectional=False,
            )
        )
    self._layers = nn.ModuleList(recurrent_stack)

    # The head is sized for one inner_size-wide vector per GRU.
    head_width = inner_size * layers_number
    self._out = FCN(head_width, output_size, (1524, 1524))

    self._num_layers = layers_number
    self._inner_size = inner_size
def __init__(self,
             rgb_size=RGB_FEATURES_SIZE,
             audio_size=AUDIO_FEATURES_SIZE,
             output_size=YOUTUBE8M_LABELS_N,
             layers_number=3,
             hidden_size=1024):
    """Create growing-input linear layers, scalar scorers, BatchNorm and an FCN head.

    Args:
        rgb_size: RGB feature width; part of the base input width.
        audio_size: Audio feature width; part of the base input width.
        output_size: Forwarded to FCN as its second positional argument.
        layers_number: Number of linear layers and of scorer layers.
        hidden_size: Output width of every linear layer and input width of
            every scorer layer.
    """
    super().__init__()

    base_width = rgb_size + audio_size

    # Layer k accepts the base width plus k * hidden_size extra features,
    # so each successive layer is sized for one more hidden block.
    projections = []
    for depth in range(layers_number):
        projections.append(
            nn.Linear(base_width + hidden_size * depth, hidden_size))
    self._linear_layers = nn.ModuleList(projections)

    # One hidden_size -> 1 linear map per layer (named as attention scorers).
    scorers = []
    for _ in range(layers_number):
        scorers.append(nn.Linear(hidden_size, 1))
    self._attention_layers = nn.ModuleList(scorers)

    # BatchNorm and the FCN head share the same, fully grown width.
    full_width = base_width + hidden_size * layers_number
    self._bn = nn.BatchNorm1d(full_width)
    self._out = FCN(full_width, output_size, (4096, 4096))

    self._num_layers = layers_number
def __init__(self,
             rgb_size=RGB_FEATURES_SIZE,
             audio_size=AUDIO_FEATURES_SIZE,
             output_size=YOUTUBE8M_LABELS_N,
             layers_number=4,
             hidden_size=768):
    """Create growing-input linear layers, scorers, per-layer dropout and an FCN head.

    Args:
        rgb_size: RGB feature width; part of the base input width.
        audio_size: Audio feature width; part of the base input width.
        output_size: Forwarded to FCN as its second positional argument.
        layers_number: Number of linear, scorer and dropout layers.
        hidden_size: Output width of every linear layer and input width of
            every scorer layer.
    """
    super().__init__()

    base_width = rgb_size + audio_size

    # Layer k accepts the base width plus k * hidden_size extra features.
    projections = []
    for depth in range(layers_number):
        projections.append(
            nn.Linear(base_width + hidden_size * depth, hidden_size))
    self._linear_layers = nn.ModuleList(projections)

    # One hidden_size -> 1 linear map per layer (named as attention scorers).
    scorers = []
    for _ in range(layers_number):
        scorers.append(nn.Linear(hidden_size, 1))
    self._attention_layers = nn.ModuleList(scorers)

    # Dropout probability rises with the layer index: 0.1, 0.2, 0.3, ...
    dropouts = []
    for depth in range(layers_number):
        dropouts.append(nn.Dropout((depth + 1) / 10.))
    self._dropout_layers = nn.ModuleList(dropouts)

    full_width = base_width + hidden_size * layers_number
    self._out = FCN(full_width, output_size, (4096, 4096), dropout=0.3)

    self._num_layers = layers_number
def __init__(self,
             rgb_size=RGB_FEATURES_SIZE,
             audio_size=AUDIO_FEATURES_SIZE,
             output_size=YOUTUBE8M_LABELS_N,
             rgb_inner_size=1024,
             audio_inner_size=128,
             layers_number=1):
    """Create parallel bidirectional LSTM stacks for RGB and audio plus an FCN head.

    Args:
        rgb_size: RGB feature width.
        audio_size: Audio feature width.
        output_size: Forwarded to FCN as its second positional argument.
        rgb_inner_size: Hidden size of each RGB LSTM.
        audio_inner_size: Hidden size of each audio LSTM.
        layers_number: Number of LSTM modules in each of the two stacks.
    """
    super().__init__()

    # LSTM k is sized for the raw features plus k blocks of
    # 2 * inner_size (the factor 2 matches bidirectional=True).
    rgb_stack = []
    for depth in range(layers_number):
        rgb_stack.append(
            nn.LSTM(
                input_size=rgb_size + 2 * rgb_inner_size * depth,
                hidden_size=rgb_inner_size,
                num_layers=1,
                batch_first=True,
                bidirectional=True,
            )
        )
    self._rgb_layers = nn.ModuleList(rgb_stack)

    audio_stack = []
    for depth in range(layers_number):
        audio_stack.append(
            nn.LSTM(
                input_size=audio_size + 2 * audio_inner_size * depth,
                hidden_size=audio_inner_size,
                num_layers=1,
                batch_first=True,
                bidirectional=True,
            )
        )
    self._audio_layers = nn.ModuleList(audio_stack)

    # Width-preserving linear maps, one per modality.
    self._first_linear_rgb = nn.Linear(rgb_size, rgb_size)
    self._first_linear_audio = nn.Linear(audio_size, audio_size)

    # Per-modality widths after all LSTM blocks are accounted for.
    rgb_full_width = rgb_size + rgb_inner_size * 2 * layers_number
    audio_full_width = audio_size + audio_inner_size * 2 * layers_number

    # Linear maps to a single value per modality (named as attention scorers).
    self._rgb_attention = nn.Linear(rgb_full_width, 1)
    self._audio_attention = nn.Linear(audio_full_width, 1)

    # BatchNorm and the FCN head are sized for both modalities together.
    joint_width = rgb_full_width + audio_full_width
    self._bn = nn.BatchNorm1d(joint_width)
    self._out = FCN(joint_width, output_size, (4096, 4096))

    self._num_layers = layers_number