def from_dataframe(cls, review_df, cutoff=25):
    """Instantiate the vectorizer from the dataset dataframe

    Args:
        review_df (pandas.DataFrame): the review dataset
        cutoff (int): the parameter for frequency-based filtering
    Returns:
        an instance of the ReviewVectorizer
    """
    review_vocab = Vocabulary(add_unk=True)
    rating_vocab = Vocabulary(add_unk=False)

    # Add ratings
    for rating in sorted(set(review_df.rating)):
        rating_vocab.add_token(rating)

    # Add top words if count > provided count
    word_counts = Counter()
    for review in review_df.review:
        for word in review.split(" "):
            if word not in string.punctuation:
                word_counts[word] += 1

    for word, count in word_counts.items():
        if count > cutoff:
            review_vocab.add_token(word)

    return cls(review_vocab, rating_vocab)

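# Hypothetical usage sketch (added for illustration; not part of the original source).
# Assumes `from_dataframe` is defined as a classmethod on a `ReviewVectorizer` class and
# that the DataFrame has `review` and `rating` columns, as the snippet above expects.
#
#     import pandas as pd
#     toy_df = pd.DataFrame({
#         "review": ["great food , great service", "great value", "terrible food"],
#         "rating": ["positive", "positive", "negative"],
#     })
#     vectorizer = ReviewVectorizer.from_dataframe(toy_df, cutoff=1)
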
def vocabs_init(train_data: List[InternalParseNode]) -> Dict[str, Vocabulary]:
    print("Constructing vocabularies...", flush=True)

    pos_tags_vocab = Vocabulary()
    pos_tags_vocab.index(START)
    pos_tags_vocab.index(STOP)
    pos_tags_vocab.index(TAG_UNK)

    # words_vocab = Vocabulary()
    # words_vocab.index(START)
    # words_vocab.index(STOP)
    # words_vocab.index(UNK)

    labels_vocab = Vocabulary()
    labels_vocab.index(EMPTY_LABEL)

    for tree in train_data:
        nodes = [tree]
        while nodes:
            node = nodes.pop()
            if isinstance(node, InternalParseNode):
                labels_vocab.index(node.label)
                nodes.extend(reversed(node.children))
            else:
                pos_tags_vocab.index(node.tag)
                # words_vocab.index(node.word)

    pos_tags_vocab.freeze()
    # words_vocab.freeze()
    labels_vocab.freeze()

    print('len(pos_tags_vocab): %d\nlen(labels_vocab): %d'
          % (pos_tags_vocab.size, labels_vocab.size))

    return {'pos_tags': pos_tags_vocab, 'labels': labels_vocab}

def setup(self, stage: Optional[str] = None):
    if not path.exists(path.join(self._dataset_dir, Vocabulary.vocab_file)):
        Vocabulary.build_from_scratch(
            path.join(self._dataset_dir, f"{self._config.dataset}.{self._train}.jsonl"))
    self._vocabulary = Vocabulary(self._config)

def build_vocab(json, threshold):
    """Build a simple vocabulary wrapper."""
    coco = COCO(json)
    counter = Counter()
    ids = coco.anns.keys()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        if i % 1000 == 0:
            print("[%d/%d] Tokenized the captions." % (i, len(ids)))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab

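# Hypothetical usage sketch (added for illustration; not part of the original source).
# The annotation path below is illustrative only; it assumes a local COCO captions file
# and that nltk's 'punkt' tokenizer data has been downloaded.
#
#     vocab = build_vocab(json='data/annotations/captions_train2014.json', threshold=4)
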
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")
    annotations = pd.read_csv(config.temp_annotation_file)
    captions = annotations['caption'].values
    image_ids = annotations['image_id'].values
    image_files = annotations['image_file'].values
    data = np.load(config.temp_data_file).item()
    word_idxs = data['word_idxs']
    masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      True,
                      True)
    print("Dataset built.")
    return dataset

def dataset():
    vocab = Vocabulary(args)
    dataset = Dataset(args, vocab)

    source_files = sorted(glob.glob(args.dataset_file_path + 'train_source*.dat'))
    target_files = sorted(glob.glob(args.dataset_file_path + 'train_target*.dat'))

    print('========== Begin something about vocabulary:')
    print('Vocab Size:', dataset.vocab.vocab_size)
    print('First 10 Word2cnt:', list(dataset.vocab._word2cnt.items())[:10])
    print()

    print('========== Begin something about dataset:')
    X_lens = [len(sen.split()) for source_file in source_files for sen in open(source_file)]
    y_lens = [len(sen.split()) for target_file in target_files for sen in open(target_file)]
    print('Number of Source Sentences:', len(X_lens))
    print('Number of Target Sentences:', len(y_lens))
    print()
    print('Mean Length of Source Sentences:', np.mean(X_lens))
    print('Max Length of Source Sentences:', np.max(X_lens))
    print('Min Length of Source Sentences:', np.min(X_lens))
    print()
    print('Mean Length of Target Sentences:', np.mean(y_lens))
    print('Max Length of Target Sentences:', np.max(y_lens))
    print('Min Length of Target Sentences:', np.min(y_lens))
    print()

def _counters_to_vocab(config: Dict, counters: Dict[str, TypeCounter[str]]) -> Vocabulary:
    additional_tokens = [SOS, EOS, PAD, UNK] if config["token"]["is_wrapped"] else [PAD, UNK]
    token_to_id = _counter_to_dict(counters["token"], config["token"]["vocabulary_size"],
                                   additional_tokens)

    additional_targets = [SOS, EOS, PAD, UNK] if config["target"]["is_wrapped"] else [PAD, UNK]
    label_to_id = _counter_to_dict(counters["target"], config["target"]["vocabulary_size"],
                                   additional_targets)

    additional_nodes = [SOS, EOS, PAD, UNK] if config["path"]["is_wrapped"] else [PAD, UNK]
    node_to_id = _counter_to_dict(counters["path"], config["path"]["vocabulary_size"],
                                  additional_nodes)

    vocabulary = Vocabulary(token_to_id, node_to_id, label_to_id)
    if "type" in counters:
        additional_types = [SOS, EOS, PAD, UNK] if config["type"]["is_wrapped"] else [PAD, UNK]
        vocabulary.type_to_id = _counter_to_dict(
            counters["type"], config["type"]["vocabulary_size"], additional_types)
    return vocabulary

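# Hypothetical usage sketch (added for illustration; not part of the original source).
# The config and counter values are made up; they only mirror the keys the function reads
# ("is_wrapped" and "vocabulary_size" under "token"/"target"/"path", plus the optional "type").
#
#     from collections import Counter
#     config = {
#         "token":  {"is_wrapped": False, "vocabulary_size": 10000},
#         "target": {"is_wrapped": True,  "vocabulary_size": 5000},
#         "path":   {"is_wrapped": False, "vocabulary_size": 1000},
#     }
#     counters = {
#         "token":  Counter({"get": 3, "value": 2}),
#         "target": Counter({"get|value": 1}),
#         "path":   Counter({"token|expr|name": 4}),
#     }
#     vocabulary = _counters_to_vocab(config, counters)
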
def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file)
    image_ids = list(coco.imgs.keys())
    image_files = [
        os.path.join(config.eval_image_dir, coco.imgs[image_id]['file_name'])
        for image_id in image_ids
    ]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    if config.eval_data_count_limit > 0:
        print("-----------------------------------------------")
        print("Restricting Sz:\t", config.eval_data_count_limit)
        print("Batch Sz:\t", config.batch_size)
        image_ids = image_ids[0:config.eval_data_count_limit]
        image_files = image_files[0:config.eval_data_count_limit]

        # Dump the image paths to a file.
        filepath = 'eval_images.csv'
        with open(filepath, 'w') as file_handler:
            for i in range(0, config.eval_data_count_limit):
                file_handler.write("{}\n".format(image_files[i]))
        # print(image_files)
        print("-----------------------------------------------")

    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary

def prepare_test_data(config, image_path=None):
    """ Prepare the data for testing the model. """
    if image_path is None:
        files = os.listdir(config.test_image_dir)
        image_files = [
            os.path.join(config.test_image_dir, f) for f in files
            if f.lower().endswith('.jpg') or f.lower().endswith('.jpeg')
        ]
    else:
        image_files = [image_path]
    image_ids = list(range(len(image_files)))

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return dataset, vocabulary

def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file)

    if config.is_person_model == 'Y':
        file_data = pd.read_csv(config.person_eval_caption_file)
        image_ids = file_data['image_id'].values
        image_files = file_data['image_file'].values
    else:
        image_ids = list(coco.imgs.keys())
        image_files = [os.path.join(config.eval_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary

def build_vocab(cls, json, tokenized_captions, threshold):
    print("Building vocabulary")
    coco = COCO(json)
    counter = Counter()
    ids = coco.anns.keys()
    for i, id in enumerate(ids):
        """
        caption = str(coco.anns[id]['caption'])
        tokens = CocoDataset.tokenize(caption)
        """
        tokens = tokenized_captions[id]
        counter.update(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add the words to the vocabulary.
    vocab = Vocabulary()
    for word in words:
        vocab.add_word(word)

    print("Total vocabulary size: %d" % len(vocab))
    return vocab

def build_vocabulary(config):
    """ Build the vocabulary from the training data and save it to a file. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)
    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.build(coco.all_captions())
    vocabulary.save(config.vocabulary_file)
    return vocabulary

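# Hypothetical usage sketch (added for illustration; not part of the original source).
# Only shows the config attributes this function actually reads; the values are made up.
#
#     class Config:
#         train_caption_file = 'data/annotations/captions_train2014.json'
#         max_caption_length = 20
#         vocabulary_size = 5000
#         vocabulary_file = 'vocabulary.csv'
#
#     vocabulary = build_vocabulary(Config())
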
def prepare_test_data(config):
    """ Prepare the data for testing the model. """
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    return vocabulary

def prepare_test_data(config):
    """ Prepare the data for testing the model. """
    image_files = [config.test_file_name]
    image_ids = list(range(len(image_files)))

    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)

    dataset = DataSet(image_ids, image_files, config.batch_size)
    return dataset, vocabulary

def prepare_train_data(config):
    """ Prepare the data for training the model. """
    vqa = VQA(config.train_answer_file, config.train_question_file)
    vqa.filter_by_ques_len(config.max_question_length)
    vqa.filter_by_ans_len(1)

    print("Reading the questions and answers...")
    annotations = process_vqa(vqa, 'COCO_train2014', config.train_image_dir,
                              config.temp_train_annotation_file)
    image_files = annotations['image_file'].values
    questions = annotations['question'].values
    question_ids = annotations['question_id'].values
    answers = annotations['answer'].values
    print("Questions and answers read.")
    print("Number of questions = %d" % (len(question_ids)))

    print("Building the vocabulary...")
    vocabulary = Vocabulary()
    if not os.path.exists(config.vocabulary_file):
        for question in tqdm(questions):
            vocabulary.add_words(word_tokenize(question))
        for answer in tqdm(answers):
            vocabulary.add_words(word_tokenize(answer))
        vocabulary.compute_frequency()
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    config.vocabulary_size = vocabulary.size

    print("Processing the questions and answers...")
    if not os.path.exists(config.temp_train_data_file):
        question_word_idxs, question_lens = process_questions(questions, vocabulary, config)
        answer_idxs = process_answers(answers, vocabulary)
        data = {
            'question_word_idxs': question_word_idxs,
            'question_lens': question_lens,
            'answer_idxs': answer_idxs
        }
        np.save(config.temp_train_data_file, data)
    else:
        data = np.load(config.temp_train_data_file).item()
        question_word_idxs = data['question_word_idxs']
        question_lens = data['question_lens']
        answer_idxs = data['answer_idxs']
    print("Questions and answers processed.")

    print("Building the dataset...")
    dataset = DataSet(image_files, question_word_idxs, question_lens, question_ids,
                      config.batch_size, answer_idxs, True, True)
    print("Dataset built.")
    return dataset, config

def build_vocabulary(config, max_ann_num=None):
    """ Build the vocabulary from the training data and save it to a file. """
    coco = COCO(config.train_caption_file, config.max_train_ann_num)
    coco.filter_by_cap_len(config.max_caption_length)
    vocabulary = Vocabulary(config.vocabulary_size)
    if not config.max_train_ann_num:
        vocabulary.build(coco.all_captions())
    else:
        vocabulary.build((coco.all_captions())[:config.max_train_ann_num])
    vocabulary.save(config.vocabulary_file)
    return vocabulary

def __init__(self, args):
    self.args = args
    train = DataLoader(self.args.trainpath)
    dev = DataLoader(self.args.devpath)

    self.train_words, self.train_poss, self.train_chunks, self.train_labels = \
        train.get_all_train_tokens()
    self.train_max_sentence_len, self.train_max_word_len = train.get_required_max_len()
    self.dev_words, self.dev_poss, self.dev_chunks, self.dev_labels = \
        dev.get_all_train_tokens()
    self.dev_max_sentence_len, self.dev_max_word_len = dev.get_required_max_len()

    vocabulary = Vocabulary(self.train_words)
    self.vocab = vocabulary.get_word_vocab()
    self.char_vocab = vocabulary.get_char_vocab()

    self.train_vect = Vectorizer(self.train_max_sentence_len, self.train_max_word_len,
                                 self.vocab, self.char_vocab, self.train_words)
    self.dev_vect = Vectorizer(self.train_max_sentence_len, self.train_max_word_len,
                               self.vocab, self.char_vocab, self.dev_words)

    self.poss_vect = LabelEncoderModel(self.train_poss, self.train_max_sentence_len)
    self.chunks_vect = LabelEncoderModel(self.train_chunks, self.train_max_sentence_len)
    self.labels_vect = LabelEncoderModel(self.train_labels, self.train_max_sentence_len)  # st wrong here

    self.pos_emb_weights = self.poss_vect.get_emb_weights()
    self.chunk_emb_weights = self.chunks_vect.get_emb_weights()
    self.word_emb_weights, self.word_emb_dimensions = PretrainedEmbedder(
        self.vocab, self.args.pretrained_path).pretrained_embedder()

    self.model = ModelTraining(
        self.args.dropout,
        self.args.lr,
        len(set(sum(self.train_labels, []))),
        len(self.vocab),
        len(self.char_vocab),
        self.train_max_word_len,
        len(set(sum(self.train_poss, []))),
        len(set(sum(self.train_chunks, []))),
        word_emb_dimensions=self.word_emb_dimensions,
        word_emb_weights=self.word_emb_weights,
        pos_emb_weights=self.pos_emb_weights,
        chunk_emb_weights=self.chunk_emb_weights).model_build()

def build_vocabulary(config, captions, oracle_file):
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    if True:  # not os.path.exists(config.vocabulary_file):
        vocabulary.build(captions)
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    # print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    # return vocabulary

    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        sent_lens = []
        for caption in captions:
            current_word_idxs, current_length = vocabulary.process_sentence(caption)
            current_num_words = min(config.max_caption_length - 2, current_length)
            pad_length = config.max_caption_length - current_length - 2
            current_word_idxs = [config._START_] \
                + current_word_idxs[:current_num_words] \
                + [config._END_] \
                + [config._PAD_] * pad_length
            word_idxs.append(current_word_idxs)
            sent_lens.append(current_num_words + 2)
        word_idxs = np.array(word_idxs)
        data = {'word_idxs': word_idxs, 'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file).item()
        word_idxs = data['word_idxs']
        sent_lens = data['sentence_len']

    if oracle_file is not None:
        with open(oracle_file, 'w') as outfile:
            paras = ""
            for line in word_idxs:
                for word in line:
                    paras += (str(word) + ' ')
                paras += '\n'
            outfile.write(paras)

    return vocabulary

def main():
    word2cnt = Vocabulary()
    args = get_args()
    i = -1
    for line in open(args.input_files.strip(), 'r').readlines():
        # for line in fileinput.input():
        i += 1
        if i % 100 == 0:
            print(i)
        if i % 1000 == 0:
            word2cnt.save(args.output_prefix.strip() + "_partial")
        with open(line.rstrip(), 'r') as f:
            for line in f.readlines():
                for word in line.strip().split():
                    word2cnt.observe_word(word)
    word2cnt.save(args.output_prefix.strip())

def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    vqa = VQA(config.eval_answer_file, config.eval_question_file)
    vqa.filter_by_ques_len(config.max_question_length)
    vqa.filter_by_ans_len(1)

    print("Reading the questions...")
    annotations = process_vqa(vqa, 'COCO_val2014', config.eval_image_dir,
                              config.temp_eval_annotation_file)
    image_files = annotations['image_file'].values
    questions = annotations['question'].values
    question_ids = annotations['question_id'].values
    print("Questions read.")
    print("Number of questions = %d" % (len(question_ids)))

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    config.vocabulary_size = vocabulary.size

    print("Processing the questions...")
    if not os.path.exists(config.temp_eval_data_file):
        question_word_idxs, question_lens = process_questions(questions, vocabulary, config)
        data = {
            'question_word_idxs': question_word_idxs,
            'question_lens': question_lens
        }
        np.save(config.temp_eval_data_file, data)
    else:
        data = np.load(config.temp_eval_data_file).item()
        question_word_idxs = data['question_word_idxs']
        question_lens = data['question_lens']
    print("Questions processed.")

    print("Building the dataset...")
    dataset = DataSet(image_files, question_word_idxs, question_lens, question_ids,
                      config.batch_size)
    print("Dataset built.")
    return vqa, dataset, vocabulary, config

def __init__(self, args):
    self.args = args
    self.q1s, self.q2s, self.labels = PreprocessData().build_corpus(self.args.trainpath)
    self.q1s_dev, self.q2s_dev, self.labels_dev = PreprocessData().build_corpus(self.args.devpath)

    self.vocab = Vocabulary(self.q1s, self.q2s).get_vocab()
    self.vect = Vectorizer(self.vocab, self.args.pad)
    self.emb_weights, self.num_dimensions = Embedder(
        self.vocab, self.args.pretrained_path, self.args.use_w2v,
        self.args.num_dimensions).embedder()

    self.lstm = LSTMModel(self.args.dropout, self.args.use_bi, self.args.hidden_dim,
                          self.args.pad, len(self.vocab), self.num_dimensions,
                          self.emb_weights, self.args.trainable, self.args.use_pool,
                          self.args.first_dense_dim, self.args.lr).get_lstm_model()

def build_vocabulary(config):
    """ Build the vocabulary from the training data and save it to a file. """
    vqa = VQA(config.train_answer_file, config.train_question_file)
    vqa.filter_by_ques_len(config.max_question_length)
    vqa.filter_by_ans_len(1)

    question_ids = list(vqa.qa.keys())
    questions = [vqa.qqa[k]['question'] for k in question_ids]
    answers = [vqa.qa[k]['best_answer'] for k in question_ids]

    vocabulary = Vocabulary()
    for question in tqdm(questions):
        vocabulary.add_words(word_tokenize(question))
    for answer in tqdm(answers):
        vocabulary.add_words(word_tokenize(answer))
    vocabulary.compute_frequency()
    vocabulary.save(config.vocabulary_file)
    return vocabulary

def __init__(self, root, split, vocabulary='./utils/vocabulary.txt', transform=None):
    self.root = root
    self.split = split
    with open(os.path.join(self.root, 'talk2car_w_rpn_no_duplicates.json'), 'rb') as f:
        data = json.load(f)[self.split]
        self.data = {int(k): v for k, v in data.items()}  # Map keys to int
    self.img_dir = os.path.join(self.root, 'images')
    self.transform = transform
    self.vocabulary = Vocabulary(vocabulary)

    if self.split in ['val', 'train']:
        self.add_train_annos = True  # Add extra info when reading out items for training
    else:
        self.add_train_annos = False

    self.ignore_index = 255  # Ignore index when all RPNs < 0.5 IoU
    self.num_rpns_per_image = 8  # Number of RPN proposals kept per image
    # self.text_encoder = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

    # Filter out RPNs we are not going to use.
    # RPNs were obtained from CenterNet after soft NMS.
    # We order the scores, and take the top k.
    assert (self.num_rpns_per_image < 65)
    rpns = {k: sample['centernet'] for k, sample in self.data.items()}
    rpns_score_ordered_idx = {
        k: np.argsort([rpn['score'] for rpn in v])
        for k, v in rpns.items()
    }
    rpns = {
        k: [v[idx] for idx in rpns_score_ordered_idx[k][-self.num_rpns_per_image:]]
        for k, v in rpns.items()
    }
    for k in self.data.keys():
        self.data[k]['centernet'] = rpns[k]

def create_train_model(hparams, model_creator, scope=None):
    """Create train graph, model, and iterator."""
    print("# Creating TrainModel...")
    src_train_file = "%s/%s.%s" % (hparams.data_dir, hparams.train_prefix, hparams.src_suffix)
    tgt_train_file = "%s/%s.%s" % (hparams.data_dir, hparams.train_prefix, hparams.tgt_suffix)
    src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.src_suffix)
    tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.tgt_suffix)
    batch_size = hparams.batch_size
    num_buckets = hparams.num_buckets

    graph = tf.Graph()
    with graph.as_default(), tf.container(scope or "train"):
        skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64)

        vocabulary = Vocabulary(src_vocab_file=src_vocab_file,
                                tgt_vocab_file=tgt_vocab_file)
        iterator = TrainIterator(vocabulary=vocabulary,
                                 src_data_file=src_train_file,
                                 tgt_data_file=tgt_train_file,
                                 batch_size=batch_size,
                                 num_buckets=num_buckets,
                                 skip_count=skip_count_placeholder)

        assert isinstance(hparams, tf_training.HParams)
        model_params = get_model_params(hparams=hparams,
                                        vocabulary=vocabulary,
                                        iterator=iterator)
        model_params.add_hparam('mode', ModeKeys.TRAIN)
        model = model_creator(**model_params.values())

    return TrainModel(graph=graph,
                      model=model,
                      iterator=iterator,
                      skip_count_placeholder=skip_count_placeholder)

def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file, config.max_eval_ann_num)
    image_ids = []
    image_files = []

    if not config.max_eval_ann_num:
        print('No config.max_eval_ann_num')
        image_ids = list(coco.imgs.keys())
        image_files = [
            os.path.join(config.eval_image_dir, coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
    else:
        print('config.max_eval_ann_num=', config.max_eval_ann_num)
        image_ids = [
            coco.anns[ann_id]['image_id']
            for ann_id in islice(coco.anns, 0, config.max_eval_ann_num)
        ]
        image_files = [
            os.path.join(config.eval_image_dir, coco.imgs[image_id]['file_name'])
            for image_id in islice(image_ids, 0, config.max_eval_ann_num)
        ]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print('Download Images')
    coco.download(config.eval_image_dir, image_ids)
    print('Finished download images')

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary

def prepare_eval_new_data(caption_file, image_dir, config):
    """ Prepare the data for evaluating the model with a new dataset. """
    coco = COCO(caption_file)
    image_ids = list(coco.imgs.keys())
    image_files = [
        os.path.join(image_dir, coco.imgs[image_id]['file_name'])
        for image_id in image_ids
    ]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary

def create_eval_model(hparams, model_creator, scope=None):
    """Create eval graph, model, src/tgt file holders, and iterator."""
    print("# Creating EvalModel...")
    src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.src_suffix)
    tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.tgt_suffix)
    batch_size = hparams.batch_size
    num_buckets = hparams.num_buckets

    graph = tf.Graph()
    with graph.as_default(), tf.container(scope or "eval"):
        src_eval_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
        tgt_eval_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)

        vocabulary = Vocabulary(src_vocab_file=src_vocab_file,
                                tgt_vocab_file=tgt_vocab_file)
        iterator = EvalIterator(vocabulary=vocabulary,
                                src_data_file=src_eval_file_placeholder,
                                tgt_data_file=tgt_eval_file_placeholder,
                                batch_size=batch_size,
                                num_buckets=num_buckets)

        assert isinstance(hparams, tf_training.HParams)
        model_params = get_model_params(hparams=hparams,
                                        vocabulary=vocabulary,
                                        iterator=iterator)
        model_params.add_hparam('mode', ModeKeys.EVAL)
        model = model_creator(**model_params.values())

    return EvalModel(graph=graph,
                     model=model,
                     src_file_placeholder=src_eval_file_placeholder,
                     tgt_file_placeholder=tgt_eval_file_placeholder,
                     iterator=iterator)

def prepare_train_data(config):
    """ Prepare the data for training the model. """
    if not os.path.exists(config.prepare_annotation_dir):
        os.mkdir(config.prepare_annotation_dir)
    coco = COCO(config, config.train_caption_file, config.val_caption_file)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        coco.filter_by_cap_len(config.max_caption_length)
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
        vocabulary.save_counts(config.word_count_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")
    if not os.path.exists(config.train_csv_file):
        coco.filter_by_words(set(vocabulary.words))
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.dataset_image_dir,
                         'train' if coco.imgs[image_id]['file_name'].find('train2014') >= 0 else 'val',
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.train_csv_file)
    else:
        annotations = pd.read_csv(config.train_csv_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file, config.ignore_file_eval)
    image_ids = list(coco.imgs.keys())
    image_files = [os.path.join(config.eval_image_dir,
                                coco.imgs[image_id]['file_name'])
                   for image_id in image_ids]
    print("IMAGE FILES SHAPE PREP DATA " + str(len(image_files)))

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols,
                                config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    dataset = DataSet(coco, vocabulary, image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return dataset

def create_infer_model(hparams, model_creator, scope=None):
    """Create inference model."""
    print("# Creating InferModel...")
    src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.src_suffix)
    tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.tgt_suffix)

    graph = tf.Graph()
    with graph.as_default(), tf.container(scope or "infer"):
        src_data_placeholder = tf.placeholder(shape=[None], dtype=tf.string)
        batch_size_placeholder = tf.placeholder(shape=[], dtype=tf.int64)

        vocabulary = Vocabulary(src_vocab_file=src_vocab_file,
                                tgt_vocab_file=tgt_vocab_file)
        iterator = InferIterator(vocabulary=vocabulary,
                                 src_data=src_data_placeholder,
                                 batch_size=batch_size_placeholder)

        assert isinstance(hparams, tf_training.HParams)
        model_params = get_model_params(hparams=hparams,
                                        vocabulary=vocabulary,
                                        iterator=iterator)
        model_params.add_hparam('mode', ModeKeys.INFER)
        model = model_creator(**model_params.values())

    return InferModel(graph=graph,
                      model=model,
                      src_data_placeholder=src_data_placeholder,
                      batch_size_placeholder=batch_size_placeholder,
                      iterator=iterator)
