def __init__(self, data, opt, mode='train', image_features=None, fixed_answers_entry=None):
    """Build the dataset, dropping samples with empty questions or answers.

    Args:
        data: iterable of preprocessed sample dicts (each with
            'annotated_question', 'orign_answers', 'question_id').
        opt: option mapping; probed with `in` for 'DEBUG', 'ES_ocr',
            'BERT', 'BERT_LARGE', etc.
        mode: one of 'train', 'dev', 'test'. Gold answers are only
            required outside 'test'.
        image_features: optional preloaded image-feature store.
        fixed_answers_entry: optional fixed answer-vocabulary entry.
    """
    self.opt = opt
    self.data = []
    self.mode = mode
    assert mode in ['train', 'dev', 'test']

    # Filter out unusable samples, remembering their ids for the log line.
    error_samples = []
    for datum in data:
        if len(datum['annotated_question']['word']) == 0:
            error_samples.append(datum['question_id'])
            continue
        # Test mode carries no gold answers, so only enforce this elsewhere.
        if mode != 'test' and len(datum['orign_answers']) == 0:
            error_samples.append(datum['question_id'])
            continue
        self.data.append(datum)
    log.info('Remove {} samples for empty question or answers: {}'.format(
        len(error_samples), error_samples))
    # NOTE: the original assigned self.opt = opt a second time here; removed
    # as a harmless duplicate.

    # NOTE(review): set_dataset() presumably populates self.ocr_name_list and
    # self.od_name_list used below -- confirm against the class definition.
    self.set_dataset()
    self.debug = 'DEBUG' in self.opt
    self.debug_dataset()

    self.img_features_cache = {}
    self.image_features = image_features
    self.fixed_answers_entry = fixed_answers_entry

    if 'ES_ocr' in self.opt:
        # Prepend the ElasticSearch OCR source so it takes priority.
        self.ocr_name_list = [self.opt['ES_ocr']] + self.ocr_name_list
        self.es_ocr_len = int(self.opt['ES_ocr_len'])
        self.es_sort_way = self.opt['ES_sort_way']
    log.info('Using OCR from: {}'.format(self.ocr_name_list))
    log.info('Using OD from: {}'.format(self.od_name_list))

    if 'BERT' in self.opt:
        # The two branches only differed in the config key naming the
        # tokenizer file; dedupe the shared loading code.
        if 'BERT_LARGE' in self.opt:
            log.debug('Using BERT Large model')
            tokenizer_key = 'BERT_large_tokenizer_file'
        else:
            log.debug('Using BERT base model')
            tokenizer_key = 'BERT_tokenizer_file'
        tokenizer_file = os.path.join(self.opt['datadir'], self.opt[tokenizer_key])
        log.debug('Loading tokenizer from {}'.format(tokenizer_file))
        self.bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_file)
def __init__(self, opt, data, use_cuda, vocab, char_vocab, evaluation=False):
    """Set up the batch generator: stash options/vocabularies, optionally
    load a BERT tokenizer, and shuffle the data for training.

    Args:
        opt: option mapping; probed for 'PREV_ANS', 'PREV_QUES',
            'CHAR_CNN', 'BERT', 'BERT_LARGE',
            'ANSWER_SPAN_IN_CONTEXT_FEATURE'.
        data: list of preprocessed samples.
        use_cuda: whether batches should be moved to GPU.
        vocab: word vocabulary.
        char_vocab: character vocabulary.
        evaluation: if True, keep the data in its original order.
    """
    self.data = data
    self.use_cuda = use_cuda
    self.vocab = vocab
    self.char_vocab = char_vocab
    self.evaluation = evaluation
    self.opt = opt

    # Dialog-history sizes: how many previous answers/questions are kept.
    self.prev_ans = self.opt['PREV_ANS'] if 'PREV_ANS' in self.opt else 2
    self.prev_ques = self.opt['PREV_QUES'] if 'PREV_QUES' in self.opt else 0
    self.use_char_cnn = 'CHAR_CNN' in self.opt

    self.bert_tokenizer = None
    if 'BERT' in self.opt:
        # Branches differed only in the config key naming the tokenizer file.
        if 'BERT_LARGE' in opt:
            print('Using BERT Large model')
            tokenizer_key = 'BERT_large_tokenizer_file'
        else:
            print('Using BERT base model')
            tokenizer_key = 'BERT_tokenizer_file'
        tokenizer_file = os.path.join(opt['datadir'], opt[tokenizer_key])
        print('Loading tokenizer from', tokenizer_file)
        self.bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_file)

    self.answer_span_in_context = 'ANSWER_SPAN_IN_CONTEXT_FEATURE' in self.opt
    # Token budget: (30+1) per previous answer, (25+1) per question
    # (previous questions plus the current one).
    self.ques_max_len = (30 + 1) * self.prev_ans + (25 + 1) * (self.prev_ques + 1)
    self.char_max_len = 30

    print('*****************')
    print('prev_ques :', self.prev_ques)
    print('prev_ans :', self.prev_ans)
    print('ques_max_len:', self.ques_max_len)
    print('*****************')

    # NOTE: the original built an unused char->index map (`c2id`) here;
    # removed as dead code.

    # Shuffle for training so epoch order varies; evaluation keeps order.
    if not evaluation:
        indices = list(range(len(self.data)))
        random.shuffle(indices)
        self.data = [self.data[i] for i in indices]
def __init__(self, opt, data, use_cuda, vocab, char_vocab,
             train_img_id2idx, train_img_features, train_img_spatials,
             val_img_id2idx, val_img_features, val_img_spatials, mod='train'):
    """Set up the image-aware batch generator: stash options, vocabularies
    and pre-extracted image features, validate the configured OCR/OD
    sources against the data, and optionally load a BERT tokenizer.

    Args:
        opt: option mapping; requires 'img_fea_dim', 'img_spa_dim',
            'img_fea_num', 'ocr_name_list', 'od_name_list'; probed for
            'ModelParallel', 'ES_ocr', 'PREV_ANS', 'PREV_QUES',
            'CHAR_CNN', 'BERT', 'BERT_LARGE',
            'ANSWER_SPAN_IN_CONTEXT_FEATURE'.
        data: list of preprocessed samples; data[0] is used to verify the
            OCR/OD source names exist.
        use_cuda: whether batches should be moved to GPU.
        vocab: word vocabulary.
        char_vocab: character vocabulary.
        train_img_id2idx / train_img_features / train_img_spatials:
            image-id index map and feature/spatial arrays for training.
        val_img_id2idx / val_img_features / val_img_spatials:
            same, for validation.
        mod: dataset split tag (e.g. 'train').
    """
    self.data = data
    self.use_cuda = use_cuda
    self.vocab = vocab
    self.char_vocab = char_vocab

    # Pre-extracted image features/spatials, looked up via img_id -> index maps.
    self.train_img_features = train_img_features
    self.train_img_id2idx = train_img_id2idx
    self.train_img_spatials = train_img_spatials
    self.val_img_features = val_img_features
    self.val_img_spatials = val_img_spatials
    self.val_img_id2idx = val_img_id2idx
    self.img_feature_dim = opt['img_fea_dim']
    self.img_spatial_dim = opt['img_spa_dim']
    self.img_fea_num = opt['img_fea_num']
    self.use_img_feature = 'img_feature' in opt

    if 'ModelParallel' in opt:
        # Last listed device hosts BERT, first hosts the main model.
        # NOTE(review): self.bert_cuda / self.main_cuda stay undefined when
        # 'ModelParallel' is absent -- confirm downstream code guards on it.
        self.bert_cuda = 'cuda:{}'.format(opt['ModelParallel'][-1])
        self.main_cuda = 'cuda:{}'.format(opt['ModelParallel'][0])

    self.ocr_name_list = opt['ocr_name_list'].split(',')
    if 'ES_ocr' in opt:
        # Prepend the ElasticSearch OCR source so it takes priority.
        self.ocr_name_list = [opt['ES_ocr']] + self.ocr_name_list
        self.es_ocr_len = int(opt['ES_ocr_len'])
        self.es_sort_way = opt['ES_sort_way']
    else:
        self.es_ocr_len = None

    # Validate every requested OCR source against the first sample.
    error_ocr_name = [name for name in self.ocr_name_list
                      if name not in self.data[0]]
    if len(error_ocr_name) != 0:
        log.error('OCR name ERROR: ' + str(error_ocr_name))
        assert False
    else:
        log.info('Using OCR from: ' + str(self.ocr_name_list))

    self.mod = mod
    self.opt = opt

    # Dialog-history sizes: how many previous answers/questions are kept.
    self.prev_ans = self.opt['PREV_ANS'] if 'PREV_ANS' in self.opt else 2
    self.prev_ques = self.opt['PREV_QUES'] if 'PREV_QUES' in self.opt else 0
    self.use_char_cnn = 'CHAR_CNN' in self.opt

    self.bert_tokenizer = None
    if 'BERT' in self.opt:
        # Branches differed only in the config key naming the tokenizer
        # file (the base-model prints were already commented out).
        if 'BERT_LARGE' in opt:
            print('Using BERT Large model')
            tokenizer_key = 'BERT_large_tokenizer_file'
        else:
            tokenizer_key = 'BERT_tokenizer_file'
        tokenizer_file = os.path.join(opt['datadir'], opt[tokenizer_key])
        self.bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_file)

    self.answer_span_in_context = 'ANSWER_SPAN_IN_CONTEXT_FEATURE' in self.opt
    # Token budget: (30+1) per previous answer, (25+1) per question
    # (previous questions plus the current one).
    self.ques_max_len = (30 + 1) * self.prev_ans + (25 + 1) * (self.prev_ques + 1)
    self.char_max_len = 30

    # NOTE: the original built an unused char->index map (`c2id`) here;
    # removed as dead code.

    # Validate every requested OD (object-detection) source the same way.
    self.od_name_list = opt['od_name_list'].split(',')
    error_od_name = [name for name in self.od_name_list
                     if name not in self.data[0]]
    if len(error_od_name) != 0:
        # BUGFIX: the original concatenated the *OCR* error list here
        # ('OD name ERROR: ' + error_ocr_name), which both reported the
        # wrong names and raised TypeError (str + list) before logging.
        log.error('OD name ERROR: ' + str(error_od_name))
        assert False
    else:
        log.info('Using OD from: ' + str(self.od_name_list))