Example #1
    def __init__(self, args, logger):

        self.args = args
        self.logger = logger
        Dict = Dictionary(data_path=os.path.join(args.data_path, args.dataset),
                          task_type=args.task_type)
        self.dict = Dict.dict
        self.attr_len = Dict.attr_len
        self.all_the_poss = reduce(mul, Dict.attr_len, 1)
        self.logger.info("Experiment initializing . . . ")

        # build models
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        if args.model_type == 'POP':
            self.model = 'POP'
        elif args.model_type in ('ETN', 'ETNA'):
            self.model = ETNADemoPredictor(logger, args.model_type,
                                           len(self.dict),
                                           args.item_emb_size, Dict.attr_len,
                                           args.no_cuda).to(device)
        else:
            sys.exit()

        if args.model_type != 'POP':
            self.select_optimizer(self.model)
        self.logger.info(self.model)
        self.step_count = 0
Example #2
def create_dictionary(dataroot):
    dictionary = Dictionary()
    # general questions
    files = [
        'imsitu_questions_prev.json'
    ]

    for path in files:
        question_path = os.path.join(dataroot, path)
        q_data = json.load(open(question_path))

        for verb, values in q_data.items():
            roles = values['roles']
            for role, info in roles.items():
                question = info['question']
                dictionary.tokenize(question, True)

    # templated words
    with open(os.path.join(dataroot, 'role_abstracts.txt')) as f:
        content = f.readlines()
    verb_desc = [x.strip() for x in content]

    for desc in verb_desc:
        dictionary.tokenize(desc, True)
    # labels
    question_path = os.path.join(dataroot, 'all_label_mapping.json')
    q_data = json.load(open(question_path))

    for label, eng_name in q_data.items():
        dictionary.tokenize(eng_name, True)

    return dictionary
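
Most of the snippets on this page assume a Dictionary helper in the style of bottom-up-attention VQA code. The sketch below is inferred from how the examples call it (word2idx, idx2word, add_word, word_token, tokenize, __len__); it is a minimal reference, not any repository's actual implementation, and the splitting rule in word_token in particular is an assumption.

# Minimal sketch of the Dictionary interface these examples assume.
class Dictionary:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        # assign the next free index to an unseen word
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def word_token(self, sentence):
        # crude split; real repositories use more careful punctuation rules
        return sentence.lower().replace(',', ' ').replace('?', ' ').split()

    def tokenize(self, sentence, add_word, tokenizer=None):
        # an optional callable (e.g. a morphological analyzer) can override
        # the default splitter, as Example #15 does with Mecab().morphs
        words = tokenizer(sentence) if tokenizer else self.word_token(sentence)
        if add_word:
            return [self.add_word(w) for w in words]
        return [self.word2idx[w] for w in words if w in self.word2idx]

    def __len__(self):
        return len(self.idx2word)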
示例#3
0
def create_question_explain_dictionary(dataroot, thres):
    dictionary = Dictionary()
    counter = Counter()
    question_files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    explain_files = [
        'VQA-E_train_set.json',
        'VQA-E_val_set.json',
    ]
    for path in explain_files:
        explain_path = os.path.join(dataroot, path)
        es = json.load(open(explain_path))
        for e in es:
            counter.update(dictionary.word_token(e['explanation'][0]))

    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    for path in question_files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)

    return dictionary
Example #4
def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = ['imsitu_questions_prev.json']

    for path in files:
        question_path = os.path.join(dataroot, path)
        q_data = json.load(open(question_path))

        for verb, values in q_data.items():
            roles = values['roles']
            for role, info in roles.items():
                question = info['question']
                dictionary.tokenize(question, True)

    # add all collected words from imSitu; contains both overlaps with VQA and new words
    imsitu_words_path = os.path.join(
        dataroot, 'allnverbsall_imsitu_words_nl2glovematching.json')
    imsitu_words = json.load(open(imsitu_words_path))

    for label, eng_name in imsitu_words.items():
        dictionary.tokenize(eng_name, True)

    print('with words coming from imsitu', len(dictionary))

    return dictionary
Example #5
def create_dictionary(dataroot, dataset, old_dictionary=None, args=None):
    dictionary = Dictionary()
    if old_dictionary is not None:
        print("Copying old dictionary to new dictionary")
        dictionary.word2idx = old_dictionary.word2idx
        dictionary.idx2word = old_dictionary.idx2word

    file_names = [
        'train_questions.json', 'val_questions.json', 'test_questions.json'
    ]

    if dataset.lower() == 'vqa2':
        file_names.append('test_dev_questions.json')

    files = []
    for f in file_names:
        files.append(os.path.join(dataroot, 'vqa2', f))

    if args is not None and args.combine_with is not None:
        for cs in args.combine_with_splits:
            files.append(
                os.path.join(args.combine_with_dataroot, 'vqa2',
                             cs + "_questions.json"))

    print("files to process {}".format(files))

    for question_path in files:
        qs = json.load(open(question_path))['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)

    return dictionary
Example #6
def create_dictionary(dataroot, task='vqa'):
    dictionary = Dictionary()
    if task == 'vqa':
        files = [
            'v2_OpenEnded_mscoco_train2014_questions.json',
            'v2_OpenEnded_mscoco_val2014_questions.json',
            'v2_OpenEnded_mscoco_test2015_questions.json',
            'v2_OpenEnded_mscoco_test-dev2015_questions.json'
        ]
        for path in files:
            question_path = os.path.join(dataroot, path)
            qs = json.load(open(question_path))['questions']
            for q in qs:
                dictionary.tokenize(q['question'], True)

    elif task == 'flickr':
        files = [
            'train_ids.pkl',
            'val_ids.pkl',
            'test_ids.pkl',
        ]
        sentence_dir = os.path.join(dataroot, 'Flickr30kEntities/Sentences')

        for path in files:
            ids_file = os.path.join(dataroot, path)

            with open(ids_file, 'rb') as f:
                imgids = cPickle.load(f)

            for image_id in imgids:
                question_path = os.path.join(sentence_dir, '%d.txt' % image_id)
                phrases = get_sent_data(question_path)
                for phrase in phrases:
                    dictionary.tokenize(phrase, True)
    return dictionary
Example #7
def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)

    print('words coming from vqa', len(dictionary))

    # add all collected words from imSitu; contains both overlaps with VQA and new words
    imsitu_words_path = os.path.join(
        dataroot, 'allnverbs_imsitu_words_nl2vqamatching.json')
    imsitu_words = json.load(open(imsitu_words_path))

    for label, eng_name in imsitu_words.items():
        dictionary.tokenize(eng_name, True)

    print('with words coming from imsitu', len(dictionary))

    return dictionary
Example #8
def create_dictionary2(dataroot):
    dictionary = Dictionary()
    files = ['train/questions.txt', 'train/questions.txt']
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:
            qs = f.read().split("\n")
        for q in qs:
            dictionary.tokenize(q, True)
    return dictionary
Example #9
def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = ['VQA_caption_traindataset.pkl', 'VQA_caption_valdataset.pkl']
    for path in files:
        question_path = os.path.join(dataroot, path)
        dataset = cPickle.load(open(question_path, 'rb'))
        for idx in range(len(dataset)):
            captions = dataset[idx]['caption']
            for cap in captions:
                dictionary.tokenize(cap, True)
    return dictionary
Example #10
    def dispatch(cls, key, request):
        if key is None or request is None:
            raise Exception('key and request must not be None')

        kwargs = get_params(request)

        params = Dictionary()
        for k in kwargs:
            params.set(k, kwargs[k])
        params.filter()

        return cls.hand_logic(params, key, request)
Example #11
def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = ['allwords4verbq1.json']

    for path in files:
        question_path = os.path.join(dataroot, path)
        q_data = json.load(open(question_path))

        for label, eng_name in q_data.items():
            dictionary.tokenize(eng_name, True)

    return dictionary
Example #12
def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary
Example #13
def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = [
        'OpenEnded_abstract_v002_test2015_questions.json',
        'OpenEnded_abstract_v002_train2015_questions.json',
        'OpenEnded_abstract_v002_val2015_questions.json',
        'MultipleChoice_abstract_v002_test2015_questions.json',
        'MultipleChoice_abstract_v002_train2015_questions.json',
        'MultipleChoice_abstract_v002_val2015_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary
Example #14
def load_model_data(config, is_train=True, eval_name="val"):
    # data load
    dictionary = Dictionary()
    embedding_weight = dictionary.create_glove_embedding_init(
        pre=True, pre_dir='../data/vocabs/embedding_weight.npy')
    if is_train:
        train_dset = TextVQA('train', dictionary)
        eval_dset = TextVQA('val', dictionary)
        test_dset = None
        if eval_name == "test":
            test_dset = TextVQA('test', dictionary)
        model = build_model(train_dset, config['model_attributes'])
        return model, train_dset, eval_dset, embedding_weight, test_dset
    else:
        eval_dset = TextVQA(eval_name, dictionary)
        model = build_model(eval_dset, config['model_attributes'])
        return model, eval_dset
Example #15
def create_dictionary(dataroot, tk='mecab'):
    dictionary = Dictionary()
    if tk == 'mecab':
        tokenizer = Mecab()
    elif tk == 'kkma':
        tokenizer = Kkma()
    else:
        raise ValueError('unsupported tokenizer: ' + tk)
    files = [
        'KVQA_annotations_train.json', 'KVQA_annotations_val.json',
        'KVQA_annotations_test.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path, encoding='utf-8'))
        for q in qs:
            dictionary.tokenize(tokenize_kvqa(q['question']), True,
                                tokenizer.morphs)
    return dictionary
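
Example #15 passes a morphological analyzer's callable (Mecab().morphs) as the third argument of tokenize, so Korean questions are split into morphemes rather than on whitespace. A minimal sketch of that call pattern, assuming konlpy and its MeCab backend are installed:

# Sketch only: morpheme-level tokenization via konlpy.
from konlpy.tag import Mecab

tokenizer = Mecab()
morphs = tokenizer.morphs(u'이 사진에서 무엇을 볼 수 있나요?')
print(morphs)  # the dictionary indexes these morphemes, not raw words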
Example #16
def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = [
        'imsitu_questions_prev.json'
    ]

    for path in files:
        question_path = os.path.join(dataroot, path)
        q_data = json.load(open(question_path))

        for verb, values in q_data.items():
            roles = values['roles']
            for role, info in roles.items():
                question = info['question']
                dictionary.tokenize(question, True)

    return dictionary
Example #17
def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = [
        'vqacp_v2_train_questions.json',
        'vqacp_v2_test_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))
        for q in qs:
            dictionary.tokenize(q['question'], True)
            if 'train' in path:
                try:
                    dictionary.tokenize(q['orig_question'], True)
                except KeyError:
                    continue

    return dictionary
Example #18
def create_dictionary(dataroot):
    dictionary = Dictionary()
    role_name_corrector = 'data/roles_namecorrected.json'
    role_name_dict = json.load(open(role_name_corrector))
    files = [
        'imsitu_questions_prev.json'
    ]

    for path in files:
        question_path = os.path.join(dataroot, path)
        q_data = json.load(open(question_path))

        for verb, values in q_data.items():
            roles = values['roles']
            for role, info in roles.items():
                question = role_name_dict[role]
                dictionary.tokenize(question, True)

    return dictionary
Example #19
def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json',
        'how_many_qa/HowMany-QA/qzcreate.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))
        if "HowMany-QA" not in path:
            qs = qs['questions']
        for q in qs:
            if 'question' in q:
                dictionary.tokenize(q['question'], True)
        print(path, 'is ok')
    return dictionary
Example #20
def create_explain_dictionary(dataroot, thres):
    dictionary = Dictionary()
    counter = Counter()
    files = [
        'VQA-E_train_set.json',
        'VQA-E_val_set.json',
    ]
    for path in files:
        explain_path = os.path.join(dataroot, path)
        es = json.load(open(explain_path))
        for e in es:
            counter.update(dictionary.word_token(e['explanation'][0]))

    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    return dictionary
Example #21
def create_caption_dictionary(dataroot, thres):
    dictionary = Dictionary()
    counter = Counter()
    files = [
        'captions_train2014.json',
        'captions_val2014.json',
    ]
    for path in files:
        caption_path = os.path.join(dataroot, path)
        qs = json.load(open(caption_path))['annotations']
        for q in qs:
            counter.update(dictionary.word_token(q['caption']))

    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    return dictionary
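
Examples #20-#22 share one pattern: count tokens over a corpus, reserve the special tokens <pad>, <start>, <end>, <unk>, then keep only words whose count reaches the threshold. The sketch below shows how such a dictionary is typically used to encode a sentence; the encode helper and the max_len and thres values are illustrative, not taken from any of these repositories.

# Illustrative only: turn a sentence into a fixed-length index sequence
# using a threshold dictionary built as in Examples #20-#22.
def encode(dictionary, sentence, max_len=20):
    unk = dictionary.word2idx['<unk>']
    pad = dictionary.word2idx['<pad>']
    ids = [dictionary.word2idx['<start>']]
    ids += [dictionary.word2idx.get(w, unk)
            for w in dictionary.word_token(sentence)]
    ids.append(dictionary.word2idx['<end>'])
    return (ids + [pad] * max_len)[:max_len]

# d = create_caption_dictionary('data', thres=5)
# print(encode(d, 'A man riding a horse on the beach.'))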
Example #22
def create_VQAX_explain_dictionary(dataroot, thres):
    dictionary = Dictionary()
    counter = Counter()
    files = [
        'train_exp_anno.json',
        'val_exp_anno.json',
        'test_exp_anno.json',
    ]
    for path in files:
        explain_path = os.path.join(dataroot, path)
        es = json.load(open(explain_path))
        for _, explanations in es.items():
            for sentence in explanations:
                counter.update(dictionary.word_token(sentence))

    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    return dictionary
Example #23
def create_dictionary(dataroot, only_image_questions):
    dictionary = Dictionary()
    files = [
        'official_aaai_split_train_data.json',
        'v2_OpenEnded_mscoco_train2014_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        if path == 'official_aaai_split_train_data.json':
            examples = json.load(open(question_path))
            if only_image_questions:
                qs = [ex for ex in examples if ex['q_type'] == 'image']
            else:
                qs = [ex for ex in examples if ex['image'] is not None]
        else:
            qs = json.load(open(question_path))['questions']
            dialogs = json.load(open(os.path.join(
                dataroot, 'visdial_1.0_train.json')))['data']['dialogs']
            for dia in dialogs:
                dictionary.tokenize(dia['caption'], True)
        for example in qs:
            dictionary.tokenize(example['question'], True)
            if path == 'official_aaai_split_train_data.json':
                dictionary.tokenize(example['image']['caption'], True)
    return dictionary
Example #24
    def __init__(self):
        self.__name = "item"
        self.__id = ""
        self.__attr = Dictionary()
        self.__children = List()
Example #25
    args = get_args()
    print(args)

    # set the random seed manually for reproducibility
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)

    # Load Dictionary
    assert os.path.exists(args.train_data)
    assert os.path.exists(args.val_data)

    dictionary = Dictionary(join_path(data_dir, 'data/atec_nlp_sim_train.csv'))
    args.vocab_size = len(dictionary)
    best_val_loss = None
    best_f1 = None
    n_token = len(dictionary)
    model = ESIM(args)
    if torch.cuda.is_available():
        model = model.cuda()
    print(model)

    print('Begin to load data.')
    train_data = MyDataset(args.train_data, args.sequence_length, dictionary.word2idx, args.char_model)
    val_data = MyDataset(args.val_data, args.sequence_length, dictionary.word2idx, args.char_model)
    train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=16)
    val_loader = DataLoader(val_data, batch_size=1, shuffle=False)
Example #26
import os

import torch
from torch.utils.data import DataLoader

import baseline
import config
import utils
from dataset import Dictionary, FeatureDataset  # assumed project-local module
from train import train

dirs_list = ['./info', './save_models']

if __name__ == '__main__':
    opt = config.parse_opt()
    torch.cuda.set_device(1)
    torch.manual_seed(opt.SEED)
    torch.cuda.manual_seed(opt.SEED)
    torch.backends.cudnn.benchmark = True

    dictionary = Dictionary({'Yes': 0}, ['Yes'])
    dictionary.init_dict()

    train_set = FeatureDataset('Action', dictionary, 'Train')
    test_set = FeatureDataset('Action', dictionary, 'Test')
    constructor = 'build_baseline'
    model = getattr(baseline, constructor)(train_set, opt).cuda()
    model.w_emb.init_embedding()

    train_loader = DataLoader(train_set,
                              opt.BATCH_SIZE,
                              shuffle=True,
                              num_workers=1)
    test_loader = DataLoader(test_set,
                             opt.BATCH_SIZE,
                             shuffle=True,
                             num_workers=1)
Example #27
def create_glove_embedding_init(idx2word, glove_file):
    # signature and setup reconstructed from the __main__ call below (assumed)
    word2emb = {}
    with open(glove_file, 'r') as f:
        entries = f.readlines()
    emb_dim = len(entries[0].split(' ')) - 1
    print('embedding dim is %d' % emb_dim)
    weights = np.zeros((len(idx2word), emb_dim), dtype=np.float32)

    for entry in entries:
        vals = entry.split(' ')
        word = vals[0]
        vals = list(map(float, vals[1:]))
        word2emb[word] = np.array(vals)
    for idx, word in enumerate(idx2word):
        if word not in word2emb:
            continue
        weights[idx] = word2emb[word]
    return weights, word2emb


if __name__ == '__main__':

    caption_dictionary = Dictionary()
    caption_dictionary.add_word('<pad>')
    caption_dictionary.add_word('<unk>')
    caption_dictionary = create_dictionary(caption_dictionary)
    caption_dictionary.dump_to_file('caption_dictionary.pkl')
    emb_dim = 300
    glove_file = 'h5data/glove/glove.6B.%dd.txt' % emb_dim
    weights, word2emb = create_glove_embedding_init(
        caption_dictionary.idx2word, glove_file)
    np.save('glove6b_caption_init_%dd.npy' % emb_dim, weights)
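
The array saved above is typically loaded back into an embedding layer at training time. A minimal sketch of that step; padding_idx=0 matches <pad> being the first word added to caption_dictionary:

# Sketch: initialize an nn.Embedding from the saved GloVe matrix.
import numpy as np
import torch
import torch.nn as nn

weights = np.load('glove6b_caption_init_300d.npy')
emb = nn.Embedding(weights.shape[0], weights.shape[1], padding_idx=0)
emb.weight.data.copy_(torch.from_numpy(weights))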
Example #28
    args = get_args()
    print(args)

    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)

    # Load Dictionary
    assert os.path.exists(args.train_data)
    assert os.path.exists(args.val_data)
    print('Begin to load the dictionary.')
    dictionary = Dictionary('../data/atec_nlp_sim_train.csv')

    args.vocab_size = len(dictionary)

    best_val_loss = None
    best_f1 = None
    n_token = len(dictionary)

    embedding_net = EmbeddingCNN(args)
    print("embedding_net: {}".format(embedding_net))
    model = SiameseNet(embedding_net)
    print(model)

    print('Begin to load data.')
    train_data = MyDataset(args.train_data, args.sequence_length, dictionary.word2idx, args.char_model)
    val_data = MyDataset(args.val_data, args.sequence_length, dictionary.word2idx, args.char_model)
Example #29
def create_dictionary(question):
    dictionary = Dictionary()

    dictionary.tokenize(question, True)

    return dictionary
Example #30
from dataset import Dictionary

if __name__ == '__main__':
    d = Dictionary()
    all_sent = d.get_all_sentence()

    print(all_sent[0], all_sent[1])
    token1 = d.tokenize(all_sent[0], False)
    token2 = d.tokenize(all_sent[1], False)
    print(token1, token2)