print('using cpu')


# Flickr doesn't need to be split at the root node
def iterate_data(h5_file):
    for x in h5_file.root:
        yield x


f_nodes_mfcc = [node for node in iterate_data(data_file)]

f_nodes_flickr = [node for node in iterate_data(flickr_file)]

# split the database into train, test and validation sets. The default setting uses the json file
# with the Karpathy split
train, val, test = split_data_flickr(f_nodes_flickr, args.split_loc)
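# split_data_flickr is defined elsewhere in the repo; as a rough sketch (not the
# actual implementation), a Karpathy-style split driven by the dataset json could
# look like this, assuming each h5 node is named after its image file:
import json

def split_data_flickr_sketch(f_nodes, split_loc):
    # the Karpathy json holds a list of images, each carrying a 'split' label
    with open(split_loc) as fp:
        split_by_img = {img['filename']: img['split'] for img in json.load(fp)['images']}
    train, val, test = [], [], []
    for node in f_nodes:
        # assumed naming convention: node name is 'flickr_' + image id without extension
        name = node._v_name.replace('flickr_', '') + '.jpg'
        label = split_by_img.get(name, 'train')
        (train if label == 'train' else val if label == 'val' else test).append(node)
    return train, val, test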

textcp = pd.read_csv(
    "/Users/sebastiaanscholten/Documents/speech2image-master/vgsexperiments/experiments/Results_isolated_word_recognition/documents/textcp.csv"
)

testlist = [
    "dog", "man", "boy", "girl", "woman", "people", "dogs", "shirt", "child",
    "ball", "person", "children", "men", "girls", "bike", "rock", "camera",
    "boys", "hat", "player", "jacket", "basketball", "swing", "car", "wall",
    "hair", "football", "sunglasses", "head", "shorts", "dress", "table",
    "water", "grass", "bench", "snow", "air", "field", "street", "mouth",
    "dirt", "mountain", "pool", "ocean", "sand", "building", "soccer", "park",
    "face"
]
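# One possible use of this word list with the table loaded above; the column name
# 'word' is an assumption for illustration, not taken from textcp.csv itself:
test_rows = textcp[textcp['word'].isin(testlist)]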
Example #2
# check if cuda is available and the user wants to run on gpu
cuda = args.cuda and torch.cuda.is_available()
if cuda:
    print('using gpu')
else:
    print('using cpu')

# Flickr doesn't need to be split at the root node
def iterate_data(h5_file):
    for x in h5_file.root:
        yield x
f_nodes = [node for node in iterate_data(data_file)] 

# split the database into train, test and validation sets. The default setting uses the json file
# with the Karpathy split
train, test, val = split_data_flickr(f_nodes, args.split_loc)

############################### Neural network setup #################################################

# network modules
img_net = img_encoder(image_config)
cap_net = audio_rnn_encoder(audio_config)

# Adam optimiser. I found SGD to work terribly and could not find appropriate parameter settings for it.
optimizer = torch.optim.Adam(list(img_net.parameters()) + list(cap_net.parameters()), 1)

#plateau_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.9, patience = 100, 
#                                                   threshold = 0.0001, min_lr = 1e-8, cooldown = 100)

#step_scheduler = lr_scheduler.StepLR(optimizer, 1000, gamma=0.1, last_epoch=-1)
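# The learning rate of 1 passed to Adam above only makes sense if a scheduler rescales
# it every step; the repo's own cyclic scheduler is not shown in this fragment, so the
# LambdaLR schedule below is purely illustrative (warmup length and peak rate are assumptions):
from torch.optim import lr_scheduler

warmup_steps = 1000  # illustrative value
lambda_scheduler = lr_scheduler.LambdaLR(
    optimizer, lambda step: 2e-4 * min(1.0, (step + 1) / warmup_steps))
# call lambda_scheduler.step() after every optimizer.step() during training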
            # (inside a minibatch generator) images should be of shape (batch_size, 1024); images_shape[1] is collapsed because the original features are stored as (1, 1024)
            images = np.float64(
                np.reshape(images, (images_shape[0], images_shape[2])))
            yield images, speech, caption, lengths
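# The reshape in the fragment above collapses the singleton axis of the stored
# (1, 1024) image features; a small illustration with dummy data (shapes only):
import numpy as np

batch = np.stack([np.zeros((1, 1024)) for _ in range(8)])    # shape (8, 1, 1024)
flat = np.reshape(batch, (batch.shape[0], batch.shape[2]))   # shape (8, 1024)
assert flat.shape == (8, 1024)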


# load the word frequency dictionary
f_dict = load_obj(dict_path)
# select words which occur between 50 and 1000 times and are over 3 characters long
words = select(f_dict, 50, 1000, 3)
vocab_size = len(words)
# open and load the data
data_file = tables.open_file(data_loc, mode='r+')
f_nodes = [node for node in iterate_flickr(data_file)]
# split the data
train, val, test = split_data_flickr(f_nodes, split_loc)
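# select() is defined elsewhere; a sketch of the frequency/length filter it is
# described as performing above (assumed implementation, f_dict maps word -> count):
def select_sketch(freq_dict, min_occ, max_occ, min_len):
    return [w for w, count in freq_dict.items()
            if min_occ <= count <= max_occ and len(w) > min_len]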


############################### network config #################################
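# audio_config is built elsewhere; an illustrative example of the structure the encoder
# below expects. The 'conv' keys are the ones read in __init__; the values and the
# contents of 'rnn' and 'att' are placeholders, since they are not shown in this fragment:
audio_config_example = {
    'conv': {'in_channels': 39, 'out_channels': 64, 'kernel_size': 6,
             'stride': 2, 'padding': 0},
    'rnn': {},  # settings consumed by the recurrent part of the encoder
    'att': {},  # settings consumed by the attention pooling layer
}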
# rnn encoder for audio (mfcc, mbn etc.)
class audio_rnn_encoder(nn.Module):
    def __init__(self, config):
        super(audio_rnn_encoder, self).__init__()
        conv = config['conv']
        rnn = config['rnn']
        att = config['att']
        self.Conv = nn.Conv1d(in_channels=conv['in_channels'],
                              out_channels=conv['out_channels'],
                              kernel_size=conv['kernel_size'],
                              stride=conv['stride'],
                              padding=conv['padding'])