예제 #1
0
def get_nets():
    """Return the networks under comparison as ``(label, network)`` pairs.

    Two networks are restored with ``load_best_run`` from run directories
    under ``PREFIX``; a third is newly constructed from a hard-coded
    configuration and moved to the GPU with ``.cuda()``.

    Returns:
        A list of three tuples, in this order:

        * ``('m6_init', net_init)`` -- a new ``D.Net(config)`` instance
          (not loaded from any run).
        * ``('m1', net_base)`` -- loaded from the
          ``s2-t.-s2i2-s2t.-t2s.-t2i.`` run with ``cond=''``.
        * ``('m6', net_mt)`` -- loaded from the
          ``s2-t1-s2i2-s2t0-t2s0-t2i1`` run with ``cond='joint'``.
    """
    import vg.defn.three_way2 as D

    # Each run is loaded exactly once.  net_mt must be available before the
    # config is built, because the text encoder's input feature size below
    # is copied from it.
    net_base = load_best_run('{}/s2-t.-s2i2-s2t.-t2s.-t2i.'.format(PREFIX),
                             cond='')
    net_mt = load_best_run('{}/s2-t1-s2i2-s2t0-t2s0-t2i1'.format(PREFIX),
                           cond='joint')

    # NOTE(review): these hyperparameters presumably mirror the ones the
    # 'm6' run was trained with -- confirm against that run's configuration.
    config = dict(TextImage=dict(ImageEncoder=dict(size=1024,
                                                   size_target=4096),
                                 lr=0.0002,
                                 margin_size=0.2,
                                 max_norm=2.0,
                                 TextEncoderTop=dict(size=1024,
                                                     size_feature=1024,
                                                     depth=1,
                                                     size_attn=128)),
                  SpeechImage=dict(ImageEncoder=dict(size=1024,
                                                     size_target=4096),
                                   lr=0.0002,
                                   margin_size=0.2,
                                   max_norm=2.0,
                                   SpeechEncoderTop=dict(size=1024,
                                                         size_input=1024,
                                                         depth=2,
                                                         size_attn=128)),
                  SpeechText=dict(TextEncoderTop=dict(size_feature=1024,
                                                      size=1024,
                                                      depth=0,
                                                      size_attn=128),
                                  SpeechEncoderTop=dict(size=1024,
                                                        size_input=1024,
                                                        depth=0,
                                                        size_attn=128),
                                  lr=0.0002,
                                  margin_size=0.2,
                                  max_norm=2.0),
                  SpeechEncoderBottom=dict(size=1024,
                                           depth=2,
                                           size_vocab=13,
                                           filter_length=6,
                                           filter_size=64,
                                           stride=2),
                  TextEncoderBottom=dict(
                      # Reuse the loaded network's text feature size so the
                      # new network is built with the same value.
                      size_feature=net_mt.TextEncoderBottom.size_feature,
                      size_embed=128,
                      size=1024,
                      depth=1))

    net_init = D.Net(config).cuda()
    return [('m6_init', net_init), ('m1', net_base), ('m6', net_mt)]
예제 #2
0
                    SpeechEncoderBottom=dict(size=1024, depth=2, size_vocab=13, filter_length=6, filter_size=64, stride=2),
                    TextEncoderBottom=dict(size_feature=data_flickr.mapper.size(),
                                           size_embed=128,
                                           size=1024,
                                           depth=1)
                   )






def audio(sent):
    """Return the value stored under the ``'audio'`` key of ``sent``.

    Raises whatever ``sent['audio']`` raises when the key is absent
    (``KeyError`` for a plain dict).
    """
    audio_value = sent['audio']
    return audio_value

# Build the network from model_config, then clear its batcher and mapper
# attributes.
net = D.Net(model_config)
net.batcher, net.mapper = None, None

# Scorer over the 'val' split of prov_flickr; the module-level `audio`
# function is passed as the `tokenize` option.
scorer = vg.scorer.Scorer(
    prov_flickr,
    {'split': 'val', 'tokenize': audio, 'batch_size': batch_size},
)
                  

run_config = dict(epochs=epochs,
                  validate_period=400,
                  tasks=[ ('SpeechText', net.SpeechText),
                          ('SpeechImage', net.SpeechImage),
                          ('TextImage', net.TextImage)],