def _process_vocab(args, questions) -> Dict:
    """If input_vocab_json is provided, use (or expand) it; otherwise build the vocab from the train files."""
    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        logger.info('Building vocab')
        answer_token_to_idx = None  # stays None for splits without answers (e.g. test questions)
        if 'answer' in questions[0]:
            answer_token_to_idx = preprocess_utils.build_vocab(
                (q['answer'] for q in questions))
        question_token_to_idx = preprocess_utils.build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = preprocess_utils.build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }
    if args.input_vocab_json != '':
        logger.info('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json) as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    logger.info('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            logger.info('Found %d new words' % num_new_words)
    if args.output_vocab_json != '':
        utils.mkdirs(os.path.dirname(args.output_vocab_json))
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)
    return vocab
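# Usage sketch (not part of the original module): how the vocab flags interact.
# The Namespace fields mirror the attributes _process_vocab reads; the file
# paths and the mode value here are hypothetical.
import json
from argparse import Namespace

args = Namespace(input_vocab_json='',     # '' -> build a fresh vocab from `questions`
                 expand_vocab=0,          # 1 -> merge new question words into a loaded vocab
                 unk_threshold=1,         # min count before a question token is kept
                 mode='prefix',           # forwarded to program_to_str
                 output_vocab_json='data/vocab.json')
with open('data/train_questions.json') as f:
    questions = json.load(f)['questions']
vocab = _process_vocab(args, questions)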
def test():
    _, _, _, sentence_size, vocab_size = build_corpus()
    v2i, _ = build_vocab()
    _, i2l = build_label()
    # Whitespace-tokenized sample questions:
    # 'the weather is nice today', 'introduce precious-metal products'
    origin_questions = ['今天 天气 不错', '介绍 贵金属 产品']
    questions = [q.split() for q in origin_questions]
    questions = [[v2i[vocab] for vocab in ques if vocab in v2i] for ques in questions]
    config = tf.ConfigProto()
    with tf.Session(config=config) as sess:
        model = Model(sentence_size, vocab_size, FLAGS.embed_size,
                      FLAGS.class_num, FLAGS.learning_rate, FLAGS.decay_step,
                      FLAGS.decay_rate, FLAGS.layer_size, FLAGS.multi_channel_size)
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(FLAGS.check_point))
        questions = pad_sequences(questions, maxlen=sentence_size, value=0)
        feed_dict = {
            model.encoder_input: questions,
            model.batch_size: FLAGS.batch_size
        }
        p = sess.run([model.predict], feed_dict=feed_dict)
        p = p[0].tolist()
        for index in range(len(questions)):
            print(f'{origin_questions[index]} is_business: {i2l[p[index]]}')
def process_dataset():
    train_dataset = dp.load_dataset(args.data, is_english=True, has_label=True,
                                    use_target='word', use_first_target=False)
    # Y = [ins['label'] for ins in train_dataset]
    # class_distri = class_weight.compute_class_weight('balanced', np.unique(Y), Y)
    # print('Set class weights: {}'.format(class_distri))
    val_dataset = None
    if args.dev:
        val_dataset = dp.load_dataset(args.dev, is_english=True, has_label=True,
                                      use_target='word', use_first_target=False)
        full_dataset = train_dataset + val_dataset
    else:
        full_dataset = train_dataset
    args.text_seq_len = max(len(ins['text_words']) for ins in full_dataset)
    args.tar_seq_len = max(len(ins['tar_words']) for ins in full_dataset)
    print('text seq len: ', args.text_seq_len)
    print('tar seq len: ', args.tar_seq_len)
    args.vocab_size, token2id, args.embeddings, args.emb_dim = dp.build_vocab(
        args.vocab,
        data=[ins['text_words'] for ins in full_dataset],
        embedding=args.emb,
        tf_limit=args.tf_limit)
    # List of tuples: (encoded_text, encoded_target, encoded_idx, encoded_label)
    train_encoded = dp.build_dataset(train_dataset, token2id, args.cat2id,
                                     args.text_seq_len, args.tar_seq_len)
    np.random.shuffle(train_encoded)
    if args.dev:
        val_encoded = dp.build_dataset(val_dataset, token2id, args.cat2id,
                                       args.text_seq_len, args.tar_seq_len)
    else:
        train_encoded, val_encoded = train_test_split(train_encoded,
                                                      test_size=0.2,
                                                      random_state=1314)
    if pad_dataset:
        train_encoded = dp.pad_dataset(train_encoded, args.batch_size)
    train_text, train_target, train_tar_idx, train_label = map(
        np.array, zip(*train_encoded))
    val_text, val_target, val_tar_idx, val_label = map(
        np.array, zip(*val_encoded))
    # Load the test dataset
    test_dataset = dp.load_dataset(args.test, is_english=True, has_label=True,
                                   use_target='word', use_first_target=False)
    test_encoded = dp.build_dataset(test_dataset, token2id, args.cat2id,
                                    args.text_seq_len, args.tar_seq_len)
    test_text, test_target, test_tar_idx, test_label = map(
        np.array, zip(*test_encoded))
    return args, \
        (train_text, train_target, train_tar_idx, train_label), \
        (val_text, val_target, val_tar_idx, val_label), \
        (test_text, test_target, test_tar_idx, test_label)
def process_dataset():
    train_dataset = dp.load_dataset(args.data, is_english=True, has_label=True,
                                    use_target='word', use_first_target=False)
    val_dataset = None
    if args.dev:
        val_dataset = dp.load_dataset(args.dev, is_english=True, has_label=True,
                                      use_target='word', use_first_target=False)
        full_dataset = train_dataset + val_dataset
    else:
        full_dataset = train_dataset
    args.text_seq_len = max(len(ins['text_words']) for ins in full_dataset)
    args.tar_seq_len = max(len(ins['tar_words']) for ins in full_dataset)
    print('text seq len: ', args.text_seq_len)
    print('tar seq len: ', args.tar_seq_len)
    args.vocab_size, token2id, args.embeddings, args.emb_dim = dp.build_vocab(
        args.vocab,
        data=[ins['text_words'] for ins in full_dataset],
        embedding=args.emb,
        tf_limit=args.tf_limit)
    # List of tuples: (encoded_text, encoded_target, encoded_idx, encoded_label)
    train_encoded = dp.build_dataset(train_dataset, token2id, args.cat2id,
                                     args.text_seq_len, args.tar_seq_len)
    np.random.shuffle(train_encoded)
    if args.dev:
        val_encoded = dp.build_dataset(val_dataset, token2id, args.cat2id,
                                       args.text_seq_len, args.tar_seq_len)
    else:
        train_encoded, val_encoded = train_test_split(train_encoded,
                                                      test_size=0.2,
                                                      random_state=1234)
    if pad_dataset:
        train_encoded = dp.pad_dataset(train_encoded, args.batch_size)
    train_text, train_target, train_tar_idx, train_label = map(
        np.array, zip(*train_encoded))
    val_text, val_target, val_tar_idx, val_label = map(
        np.array, zip(*val_encoded))
    return args, \
        (train_text, train_target, train_tar_idx, train_label), \
        (val_text, val_target, val_tar_idx, val_label)
def load_vocabulary():
    if os.path.exists(conf['vocab_path']) and not conf['first_run?']:
        # np.load defaults to allow_pickle=False since NumPy 1.16.3, so pass it
        # explicitly; the cached vocab is a pickled dict, hence .item().
        vocab = np.load(conf['vocab_path'], allow_pickle=True).item()
        print('Loaded vocabulary.')
    else:
        # Build a single vocab shared by both languages
        print('Building vocabulary...')
        vocab = preprocess.build_vocab(conf)
        print('Built vocabulary.')
    return vocab
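# Round-trip sketch (an assumption: the cached vocab is a plain dict written
# with np.save, which is what the .item() call above implies).
import numpy as np

vocab = {'src': {'hello': 0}, 'tgt': {'bonjour': 0}}  # toy stand-in
np.save('vocab.npy', vocab)            # pickles the dict into a 0-d object array
loaded = np.load('vocab.npy', allow_pickle=True).item()
assert loaded == vocab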
def process_dataset_cv():
    train_dataset = dp.load_dataset(args.data, is_english=True, has_label=True,
                                    use_target='word', use_first_target=False)
    if args.test:
        test_dataset = dp.load_dataset(args.test, is_english=True, has_label=True,
                                       use_target='word', use_first_target=False)
        full_dataset = train_dataset + test_dataset
    else:
        full_dataset = train_dataset
    args.text_seq_len = max(len(ins['text_words']) for ins in full_dataset)
    args.tar_seq_len = max(len(ins['tar_words']) for ins in full_dataset)
    print('text seq len: ', args.text_seq_len)
    print('tar seq len: ', args.tar_seq_len)
    args.vocab_size, token2id, args.embeddings, args.emb_dim = dp.build_vocab(
        args.vocab,
        data=[ins['text_words'] for ins in full_dataset],
        embedding=args.emb,
        tf_limit=args.tf_limit)
    # List of tuples: (encoded_text, encoded_target, encoded_idx, encoded_label)
    data_encoded = dp.build_dataset(full_dataset, token2id, args.cat2id,
                                    args.text_seq_len, args.tar_seq_len)
    text, target, tar_idx, label = map(np.array, zip(*data_encoded))
    Y = [ins['label'] for ins in full_dataset]
    fold = list(StratifiedKFold(n_splits=args.kfold, shuffle=True,
                                random_state=1234).split(text, Y))
    return args, fold, (text, target, tar_idx), label
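# Usage sketch (not in the original source): consuming the folds returned
# above. Each entry of `fold` is a (train_indices, test_indices) pair from
# StratifiedKFold.split, so the arrays can be sliced directly.
args, fold, (text, target, tar_idx), label = process_dataset_cv()
for k, (train_idx, test_idx) in enumerate(fold):
    x_train, y_train = text[train_idx], label[train_idx]
    x_test, y_test = text[test_idx], label[test_idx]
    print(f'fold {k}: {len(train_idx)} train / {len(test_idx)} test')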
def test():
    v2i, _ = build_vocab()
    _, i2l = build_label()
    # Whitespace-tokenized sample questions:
    # 'the weather is nice today', 'introduce precious-metal products'
    origin_questions = ['今天 天气 不错', '介绍 贵金属 产品']
    questions = [q.split() for q in origin_questions]
    questions = [[v2i[vocab] for vocab in ques if vocab in v2i] for ques in questions]
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(checkpoint_path + model_name)
        saver.restore(sess, tf.train.latest_checkpoint(checkpoint_path))
        model = tf.get_default_graph()
        x = model.get_tensor_by_name("x:0")
        predict = model.get_tensor_by_name("predictions:0")
        # x.shape[1] is a tf.Dimension; cast to int for pad_sequences
        questions = pad_sequences(questions, maxlen=int(x.shape[1]), value=0)
        feed_dict = {x: questions}
        p = sess.run([predict], feed_dict=feed_dict)
        p = p[0].tolist()
        for index in range(len(questions)):
            print(f'{origin_questions[index]} is_business: {i2l[p[index]]}')
def get_data(args):
    path = Path(args.data_path)
    f_train = path / 'train.json'
    f_test = path / 'test.json'
    f_val = path / 'val.json'
    tokenizer = Tokenizer()
    vocab = build_vocab([f_train, f_test, f_val], tokenizer=tokenizer.tokenize,
                        min_freq=2, max_size=50000)
    train_ds = ClassificationDataset(fname=f_train, tokenizer=tokenizer.tokenize, vocab=vocab)
    test_ds = ClassificationDataset(fname=f_test, tokenizer=tokenizer.tokenize, vocab=vocab)
    val_ds = ClassificationDataset(fname=f_val, tokenizer=tokenizer.tokenize, vocab=vocab)
    collator = ClfPadCollator(args.max_seq_length)
    train_iter = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True,
                            collate_fn=collator.collate)
    test_iter = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False,
                           collate_fn=collator.collate)
    val_iter = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False,
                          collate_fn=collator.collate)
    return train_iter, val_iter, test_iter, vocab
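# Usage sketch (hypothetical wiring, not from the original source). The batch
# layout depends on ClfPadCollator.collate, so it is left opaque here.
train_iter, val_iter, test_iter, vocab = get_data(args)
print(f'train batches per epoch: {len(train_iter)}')
for batch in train_iter:
    pass  # forward/backward pass goes here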
def main(args):
    print('Loading captions')
    with open(args.input_captions_json, 'r') as f:
        captions = json.load(f)
    with open(args.input_neg_captions_json, 'r') as f:
        neg_captions = json.load(f)
    with open(args.split_json, 'r') as f:
        splits = json.load(f)

    all_imgs = sorted(os.listdir(args.input_image_dir))
    captioned_imgs = list(captions.keys())
    all_captions = []
    for img, caps in captions.items():
        all_captions.extend(caps)
    all_neg_captions = []
    for img, caps in neg_captions.items():
        all_neg_captions.extend(caps)

    # Extract train data points
    train_split = splits['train']
    train_imgs = [all_imgs[idx] for idx in train_split]
    train_captions = []
    train_neg_captions = []
    for img in train_imgs:
        train_captions.extend(captions[img])
        train_neg_captions.extend(neg_captions[img])

    N = len(all_imgs)
    N_captioned = len(captions)
    M = len(all_captions)
    M_neg = len(all_neg_captions)
    print('Total images: %d' % N)
    print('Total captioned images: %d' % N_captioned)
    print('Total captions: %d' % M)
    print('Total negative captions: %d' % M_neg)
    print('Total train images: %d' % len(train_imgs))
    print('Total train captions: %d' % len(train_captions))
    print('Total train neg captions: %d' % len(train_neg_captions))

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '':
        print('Building vocab')
        word_to_idx = build_vocab(train_captions + train_neg_captions,
                                  min_token_count=args.word_count_threshold,
                                  punct_to_keep=[';', ','],
                                  punct_to_remove=['?', '.'])
    else:
        print('Loading vocab')
        with open(args.input_vocab_json, 'r') as f:
            word_to_idx = json.load(f)
    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(word_to_idx, f)

    # Encode all captions.
    # First, figure out the max caption length.
    all_cap_tokens = []
    max_length = -1
    cap_keys = sorted(captions.keys())
    for img in cap_keys:
        caps = captions[img]
        assert len(caps) > 0, 'error: some image has no caption'
        tokens_list = []
        for cap in caps:
            cap_tokens = tokenize(cap,
                                  add_start_token=True,
                                  add_end_token=False,
                                  punct_to_keep=[';', ','],
                                  punct_to_remove=['?', '.'])
            tokens_list.append(cap_tokens)
            max_length = max(max_length, len(cap_tokens))
        all_cap_tokens.append((img, tokens_list))

    # Negative captions are tokenized over the same keys: every captioned
    # image is assumed to also have negative captions, and max_length from
    # the positive captions is assumed to bound their length as well.
    all_neg_cap_tokens = []
    for img in cap_keys:
        neg_caps = neg_captions[img]
        assert len(neg_caps) > 0, 'error: some image has no negative caption'
        neg_tokens_list = []
        for neg_cap in neg_caps:
            neg_cap_tokens = tokenize(neg_cap,
                                      add_start_token=True,
                                      add_end_token=False,
                                      punct_to_keep=[';', ','],
                                      punct_to_remove=['?', '.'])
            neg_tokens_list.append(neg_cap_tokens)
        all_neg_cap_tokens.append((img, neg_tokens_list))

    print('Encoding captions')
    # np.int was removed in NumPy 1.24; use np.int64 explicitly.
    label_arrays = []
    label_start_idx = -np.ones(N, dtype=np.int64)
    label_end_idx = -np.ones(N, dtype=np.int64)
    label_length = np.zeros(M, dtype=np.int64)
    caption_counter = 0
    counter = 0
    for img, tokens_list in all_cap_tokens:
        # The image index is parsed from the numeric suffix of the filename
        i = int(img.split('.')[0].split('_')[-1])
        n = len(tokens_list)
        Li = np.zeros((n, max_length), dtype=np.int64)
        for j, tokens in enumerate(tokens_list):
            label_length[caption_counter] = len(tokens)
            caption_counter += 1
            tokens_encoded = encode(tokens, word_to_idx,
                                    allow_unk=args.allow_unk == 1)
            for k, w in enumerate(tokens_encoded):
                Li[j, k] = w  # captions are padded with zeros
        label_arrays.append(Li)
        label_start_idx[i] = counter
        label_end_idx[i] = counter + n - 1
        counter += n
    L = np.concatenate(label_arrays, axis=0)  # put all labels together
    assert L.shape[0] == M, "lengths don't match?"
    assert np.all(label_length > 0), 'error: some captions have no word?'

    print('Encoding negative captions')
    neg_label_arrays = []
    neg_label_start_idx = -np.ones(N, dtype=np.int64)
    neg_label_end_idx = -np.ones(N, dtype=np.int64)
    neg_label_length = np.zeros(M_neg, dtype=np.int64)
    neg_caption_counter = 0
    neg_counter = 0
    for img, tokens_list in all_neg_cap_tokens:
        i = int(img.split('.')[0].split('_')[-1])
        n = len(tokens_list)
        Li = np.zeros((n, max_length), dtype=np.int64)
        for j, tokens in enumerate(tokens_list):
            neg_label_length[neg_caption_counter] = len(tokens)
            neg_caption_counter += 1
            tokens_encoded = encode(tokens, word_to_idx,
                                    allow_unk=args.allow_unk == 1)
            for k, w in enumerate(tokens_encoded):
                Li[j, k] = w  # captions are padded with zeros
        neg_label_arrays.append(Li)
        neg_label_start_idx[i] = neg_counter
        neg_label_end_idx[i] = neg_counter + n - 1
        neg_counter += n
    neg_L = np.concatenate(neg_label_arrays, axis=0)  # put all labels together
    assert neg_L.shape[0] == M_neg, "lengths don't match?"
    assert np.all(neg_label_length > 0), 'error: some captions have no word?'

    # Create h5 file
    print('Writing output')
    print('Encoded captions array size: ', L.shape)
    print('Encoded negative captions array size: ', neg_L.shape)
    with h5py.File(args.output_h5, 'w') as f:
        f.create_dataset('labels', data=L)
        f.create_dataset('label_start_idx', data=label_start_idx)
        f.create_dataset('label_end_idx', data=label_end_idx)
        f.create_dataset('label_length', data=label_length)
        f.create_dataset('neg_labels', data=neg_L)
        f.create_dataset('neg_label_start_idx', data=neg_label_start_idx)
        f.create_dataset('neg_label_end_idx', data=neg_label_end_idx)
        f.create_dataset('neg_label_length', data=neg_label_length)
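# Usage sketch (not part of the original script): reading back the h5 layout
# written above. Captions for image index i occupy rows
# label_start_idx[i] .. label_end_idx[i] of 'labels'; -1 means no captions.
import h5py

def read_captions_for_image(h5_path, i):
    """Return the zero-padded encoded caption rows for image index i."""
    with h5py.File(h5_path, 'r') as f:
        start = int(f['label_start_idx'][i])
        end = int(f['label_end_idx'][i])
        if start < 0:  # image i had no captions
            return None
        return f['labels'][start:end + 1]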
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        answer_token_to_idx = None  # stays None for splits without answers (e.g. test questions)
        if 'answer' in questions[0]:
            answer_token_to_idx = preprocess_utils.build_vocab(
                (q['answer'] for q in questions))
        question_token_to_idx = preprocess_utils.build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = preprocess_utils.build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        utils.mkdirs(os.path.dirname(args.output_vocab_json))
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']
        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = preprocess_utils.tokenize(question,
                                                    punct_to_keep=[';', ','],
                                                    punct_to_remove=['?', '.'])
        question_encoded = preprocess_utils.encode(
            question_tokens,
            vocab['question_token_to_idx'],
            allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = preprocess_utils.tokenize(program_str)
            program_encoded = preprocess_utils.encode(
                program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)
        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs with <NULL> up to the max length
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])
    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    utils.mkdirs(os.path.dirname(args.output_h5_file))
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families', data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
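# Decode sketch (not part of the original script): invert the question vocab
# written above and strip <NULL> padding to recover the tokens of one row of
# the 'questions' dataset. The two file paths are hypothetical.
import json

import h5py

with open('data/vocab.json', 'r') as f:
    vocab = json.load(f)
idx_to_token = {idx: tok for tok, idx in vocab['question_token_to_idx'].items()}
null_idx = vocab['question_token_to_idx']['<NULL>']

with h5py.File('data/questions.h5', 'r') as f:
    row = f['questions'][0]
print([idx_to_token[int(i)] for i in row if int(i) != null_idx])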