Example #1
    def from_dataframe(cls, review_df, cutoff=25):
        """Instantiate the vectorizer from the dataset dataframe

        Args:
            review_df (pandas.DataFrame): the review dataset
            cutoff (int): the parameter for frequency-based filtering
        Returns:
            an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        # Add ratings
        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)

        # Add words whose count exceeds the provided cutoff
        word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1

        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)
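For reference, a minimal self-contained sketch of the same frequency-cutoff idea, using only pandas and the standard library. SimpleVocabulary and vocab_from_dataframe are hypothetical stand-ins for illustration, not the Vocabulary/ReviewVectorizer classes used above.

import string
from collections import Counter

import pandas as pd


class SimpleVocabulary:
    """Hypothetical minimal vocabulary: maps tokens to integer ids."""
    def __init__(self):
        self.token_to_idx = {}

    def add_token(self, token):
        if token not in self.token_to_idx:
            self.token_to_idx[token] = len(self.token_to_idx)
        return self.token_to_idx[token]


def vocab_from_dataframe(review_df, cutoff=1):
    """Keep only words that appear more than `cutoff` times."""
    word_counts = Counter()
    for review in review_df.review:
        for word in review.split(" "):
            if word not in string.punctuation:
                word_counts[word] += 1
    vocab = SimpleVocabulary()
    for word, count in word_counts.items():
        if count > cutoff:
            vocab.add_token(word)
    return vocab


df = pd.DataFrame({"review": ["good food good price", "bad food"],
                   "rating": ["positive", "negative"]})
print(vocab_from_dataframe(df, cutoff=1).token_to_idx)  # {'good': 0, 'food': 1}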
Example #2
def vocabs_init(train_data: List[InternalParseNode]) -> Dict[str, Vocabulary]:
    print("Constructing vocabularies...", flush=True)

    pos_tags_vocab = Vocabulary()
    pos_tags_vocab.index(START)
    pos_tags_vocab.index(STOP)
    pos_tags_vocab.index(TAG_UNK)

    # words_vocab = Vocabulary()
    # words_vocab.index(START)
    # words_vocab.index(STOP)
    # words_vocab.index(UNK)

    labels_vocab = Vocabulary()
    labels_vocab.index(EMPTY_LABEL)

    for tree in train_data:
        nodes = [tree]
        while nodes:
            node = nodes.pop()
            if isinstance(node, InternalParseNode):
                labels_vocab.index(node.label)
                nodes.extend(reversed(node.children))
            else:
                pos_tags_vocab.index(node.tag)
                # words_vocab.index(node.word)

    pos_tags_vocab.freeze()
    # words_vocab.freeze()
    labels_vocab.freeze()

    print('len(pos_tags_vocab): %d\nlen(labels_vocab): %d' %
          (pos_tags_vocab.size, labels_vocab.size))

    return {'pos_tags': pos_tags_vocab, 'labels': labels_vocab}
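A self-contained sketch of the same stack-based tree traversal, with namedtuples standing in for the InternalParseNode and leaf-node classes assumed above.

from collections import namedtuple

# Hypothetical stand-ins for the parse-node classes used in the example above.
Internal = namedtuple("Internal", ["label", "children"])
Leaf = namedtuple("Leaf", ["tag", "word"])


def collect_labels_and_tags(tree):
    """Iterative depth-first traversal, mirroring the while-loop above."""
    labels, tags = set(), set()
    nodes = [tree]
    while nodes:
        node = nodes.pop()
        if isinstance(node, Internal):
            labels.add(node.label)
            nodes.extend(reversed(node.children))
        else:
            tags.add(node.tag)
    return sorted(labels), sorted(tags)


tree = Internal("S", [Internal("NP", [Leaf("DT", "the"), Leaf("NN", "cat")]),
                      Internal("VP", [Leaf("VBD", "sat")])])
print(collect_labels_and_tags(tree))  # (['NP', 'S', 'VP'], ['DT', 'NN', 'VBD'])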
Example #3
    def setup(self, stage: Optional[str] = None):
        if not path.exists(path.join(self._dataset_dir,
                                     Vocabulary.vocab_file)):
            Vocabulary.build_from_scratch(
                path.join(self._dataset_dir,
                          f"{self._config.dataset}.{self._train}.jsonl"))
        self._vocabulary = Vocabulary(self._config)
Example #4
def build_vocab(json, threshold):
    """Build a simple vocabulary wrapper."""
    coco = COCO(json)
    counter = Counter()
    ids = coco.anns.keys()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        if i % 1000 == 0:
            print("[%d/%d] Tokenized the captions." % (i, len(ids)))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for word in words:
        vocab.add_word(word)
    return vocab
Example #5
def prepare_train_data(config):
    """ Prepare the data for training the model. """

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")

    annotations = pd.read_csv(config.temp_annotation_file)
    captions = annotations['caption'].values
    image_ids = annotations['image_id'].values
    image_files = annotations['image_file'].values

    data = np.load(config.temp_data_file).item()
    word_idxs = data['word_idxs']
    masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset
Example #6
File: preview.py Project: Xenia-W/RNN
def dataset():
    vocab = Vocabulary(args)
    dataset = Dataset(args, vocab)
    source_files = sorted(glob.glob(args.dataset_file_path + 'train_source*.dat'))
    target_files = sorted(glob.glob(args.dataset_file_path + 'train_target*.dat'))

    print('========== Begin something about vocabulary:')
    print('Vocab Size:', dataset.vocab.vocab_size)
    print('First 10 Word2cnt:', list(dataset.vocab._word2cnt.items())[:10])
    print()

    print('========== Begin something about dataset:')
    X_lens = [len(sen.split()) for source_file in source_files for sen in open(source_file)]
    y_lens = [len(sen.split()) for target_file in target_files for sen in open(target_file)]
    print('Number of Source Sentences:', len(X_lens))
    print('Number of Target Sentences:', len(y_lens))
    print()
    print('Mean Length of Source Sentences:', np.mean(X_lens))
    print('Max Length of Source Sentences:', np.max(X_lens))
    print('Min Length of Source Sentences:', np.min(X_lens))
    print()
    print('Mean Length of Target Sentences:', np.mean(y_lens))
    print('Max Length of Target Sentences:', np.max(y_lens))
    print('Min Length of Target Sentences:', np.min(y_lens))
    print()
Example #7
def _counters_to_vocab(config: Dict,
                       counters: Dict[str, TypeCounter[str]]) -> Vocabulary:
    additional_tokens = [SOS, EOS, PAD, UNK
                         ] if config["token"]["is_wrapped"] else [PAD, UNK]
    token_to_id = _counter_to_dict(counters["token"],
                                   config["token"]["vocabulary_size"],
                                   additional_tokens)
    additional_targets = [SOS, EOS, PAD, UNK
                          ] if config["target"]["is_wrapped"] else [PAD, UNK]
    label_to_id = _counter_to_dict(counters["target"],
                                   config["target"]["vocabulary_size"],
                                   additional_targets)
    additional_nodes = [SOS, EOS, PAD, UNK
                        ] if config["path"]["is_wrapped"] else [PAD, UNK]
    node_to_id = _counter_to_dict(counters["path"],
                                  config["path"]["vocabulary_size"],
                                  additional_nodes)

    vocabulary = Vocabulary(token_to_id, node_to_id, label_to_id)
    if "type" in counters:
        additional_types = [SOS, EOS, PAD, UNK
                            ] if config["type"]["is_wrapped"] else [PAD, UNK]
        vocabulary.type_to_id = _counter_to_dict(
            counters["type"], config["type"]["vocabulary_size"],
            additional_types)
    return vocabulary
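_counter_to_dict is referenced above but not shown; a plausible self-contained sketch, assuming it places the special tokens first and then keeps the most frequent tokens up to vocabulary_size (the name counter_to_dict and the exact tie-breaking are assumptions, not the project's actual helper).

from collections import Counter
from typing import Dict, List, Optional


def counter_to_dict(counter: Counter, vocab_size: Optional[int],
                    additional_tokens: List[str]) -> Dict[str, int]:
    """Special tokens get the lowest ids, frequent tokens follow."""
    token_to_id = {token: i for i, token in enumerate(additional_tokens)}
    for token, _ in counter.most_common(vocab_size):  # None keeps everything
        if token not in token_to_id:
            token_to_id[token] = len(token_to_id)
    return token_to_id


print(counter_to_dict(Counter("abracadabra"), 2, ["<pad>", "<unk>"]))
# {'<pad>': 0, '<unk>': 1, 'a': 2, 'b': 3}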
Example #8
def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file)
    image_ids = list(coco.imgs.keys())
    image_files = [
        os.path.join(config.eval_image_dir, coco.imgs[image_id]['file_name'])
        for image_id in image_ids
    ]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    if config.eval_data_count_limit > 0:
        print("-----------------------------------------------")
        print("Restricting Sz:\t", config.eval_data_count_limit)
        print("Batch Sz:\t", config.batch_size)
        image_ids = image_ids[0:config.eval_data_count_limit]
        image_files = image_files[0:config.eval_data_count_limit]
        # Dump the restricted image paths to a file
        filepath = 'eval_images.csv'
        with open(filepath, 'w') as file_handler:
            for i in range(0, config.eval_data_count_limit):
                file_handler.write("{}\n".format(image_files[i]))
        # print(image_files)
        print("-----------------------------------------------")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary
Example #9
def prepare_test_data(config, image_path=None):
    """ Prepare the data for testing the model. """
    if image_path is None:
        files = os.listdir(config.test_image_dir)
        image_files = [
            os.path.join(config.test_image_dir, f) for f in files
            if f.lower().endswith('.jpg') or f.lower().endswith('.jpeg')
        ]
    else:
        image_files = [image_path]

    image_ids = list(range(len(image_files)))

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return dataset, vocabulary
Example #10
def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file)

    if config.is_person_model == 'Y':
        file_data = pd.read_csv(config.person_eval_caption_file)
        image_ids = file_data['image_id'].values
        image_files = file_data['image_file'].values
    else:
        image_ids = list(coco.imgs.keys())
        image_files = [os.path.join(config.eval_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size,
                                config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")

    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary
Example #11
    def build_vocab(cls, json, tokenized_captions, threshold):
        print("Building vocabulary")
        coco = COCO(json)
        counter = Counter()
        ids = coco.anns.keys()
        for i, id in enumerate(ids):
            """
            caption = str(coco.anns[id]['caption'])
            tokens = CocoDataset.tokenize(caption)
            """
            tokens = tokenized_captions[id]
            counter.update(tokens)

        # If the word frequency is less than 'threshold', then the word is discarded.
        words = [word for word, cnt in counter.items() if cnt >= threshold]

        # Create a vocab wrapper.
        vocab = Vocabulary()

        # Adds the words to the vocabulary.
        for word in words:
            vocab.add_word(word)

        print("Total vocabulary size: %d" % len(vocab))
        return vocab
Example #12
def build_vocabulary(config):
    """ Build the vocabulary from the training data and save it to a file. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.build(coco.all_captions())
    vocabulary.save(config.vocabulary_file)
    return vocabulary
Example #13
def prepare_test_data(config):
    """ Prepare the data for testing the model. """
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size,
                                config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" %(vocabulary.size))
    return vocabulary
Example #14
File: dataset.py Project: shubham1172/VQA
def prepare_test_data(config):
    """ Prepare the data for testing the model. """
    image_files = [config.test_file_name]
    image_ids = list(range(len(image_files)))
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    dataset = DataSet(image_ids, image_files, config.batch_size)
    return dataset, vocabulary
Example #15
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    vqa = VQA(config.train_answer_file, config.train_question_file)
    vqa.filter_by_ques_len(config.max_question_length)
    vqa.filter_by_ans_len(1)

    print("Reading the questions and answers...")
    annotations = process_vqa(vqa, 'COCO_train2014', config.train_image_dir,
                              config.temp_train_annotation_file)

    image_files = annotations['image_file'].values
    questions = annotations['question'].values
    question_ids = annotations['question_id'].values
    answers = annotations['answer'].values
    print("Questions and answers read.")
    print("Number of questions = %d" % (len(question_ids)))

    print("Building the vocabulary...")
    vocabulary = Vocabulary()
    if not os.path.exists(config.vocabulary_file):
        for question in tqdm(questions):
            vocabulary.add_words(word_tokenize(question))
        for answer in tqdm(answers):
            vocabulary.add_words(word_tokenize(answer))
        vocabulary.compute_frequency()
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    config.vocabulary_size = vocabulary.size

    print("Processing the questions and answers...")
    if not os.path.exists(config.temp_train_data_file):
        question_word_idxs, question_lens = process_questions(
            questions, vocabulary, config)
        answer_idxs = process_answers(answers, vocabulary)
        data = {
            'question_word_idxs': question_word_idxs,
            'question_lens': question_lens,
            'answer_idxs': answer_idxs
        }
        np.save(config.temp_train_data_file, data)
    else:
        data = np.load(config.temp_train_data_file).item()
        question_word_idxs = data['question_word_idxs']
        question_lens = data['question_lens']
        answer_idxs = data['answer_idxs']
    print("Questions and answers processed.")

    print("Building the dataset...")
    dataset = DataSet(image_files, question_word_idxs, question_lens,
                      question_ids, config.batch_size, answer_idxs, True, True)
    print("Dataset built.")
    return dataset, config
Example #16
def build_vocabulary(config, max_ann_num=None):
    """ Build the vocabulary from the training data and save it to a file. """
    coco = COCO(config.train_caption_file, config.max_train_ann_num)
    coco.filter_by_cap_len(config.max_caption_length)

    vocabulary = Vocabulary(config.vocabulary_size)
    if not config.max_train_ann_num:
        vocabulary.build(coco.all_captions())
    else:
        vocabulary.build((coco.all_captions())[:config.max_train_ann_num])
    vocabulary.save(config.vocabulary_file)
    return vocabulary
Example #17
    def __init__(self, args):
        self.args = args
        train = DataLoader(self.args.trainpath)
        dev = DataLoader(self.args.devpath)

        self.train_words, self.train_poss, self.train_chunks, self.train_labels = \
            train.get_all_train_tokens()
        self.train_max_sentence_len, self.train_max_word_len = \
            train.get_required_max_len()
        self.dev_words, self.dev_poss, self.dev_chunks, self.dev_labels = \
            dev.get_all_train_tokens()
        self.dev_max_sentence_len, self.dev_max_word_len = \
            dev.get_required_max_len()

        vocabulary = Vocabulary(self.train_words)
        self.vocab = vocabulary.get_word_vocab()
        self.char_vocab = vocabulary.get_char_vocab()

        self.train_vect = Vectorizer(self.train_max_sentence_len,
                                     self.train_max_word_len, self.vocab,
                                     self.char_vocab, self.train_words)
        self.dev_vect = Vectorizer(self.train_max_sentence_len,
                                   self.train_max_word_len, self.vocab,
                                   self.char_vocab, self.dev_words)

        self.poss_vect = LabelEncoderModel(self.train_poss,
                                           self.train_max_sentence_len)
        self.chunks_vect = LabelEncoderModel(self.train_chunks,
                                             self.train_max_sentence_len)
        self.labels_vect = LabelEncoderModel(self.train_labels,
                                             self.train_max_sentence_len)

        # TODO: something wrong here
        self.pos_emb_weights = self.poss_vect.get_emb_weights()
        self.chunk_emb_weights = self.chunks_vect.get_emb_weights()
        self.word_emb_weights, self.word_emb_dimensions = PretrainedEmbedder(
            self.vocab, self.args.pretrained_path).pretrained_embedder()
        self.model = ModelTraining(
            self.args.dropout,
            self.args.lr,
            len(set(sum(self.train_labels, []))),
            len(self.vocab),
            len(self.char_vocab),
            self.train_max_word_len,
            len(set(sum(self.train_poss, []))),
            len(set(sum(self.train_chunks, []))),
            word_emb_dimensions=self.word_emb_dimensions,
            word_emb_weights=self.word_emb_weights,
            pos_emb_weights=self.pos_emb_weights,
            chunk_emb_weights=self.chunk_emb_weights).model_build()
Example #18
def build_vocabulary(config, captions, oracle_file):
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    if True:  #not os.path.exists(config.vocabulary_file):
        vocabulary.build(captions)
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    #print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    #return vocabulary

    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        sent_lens = []
        for caption in captions:
            current_word_idxs, current_length = vocabulary.process_sentence(
                caption)
            current_num_words = min(config.max_caption_length - 2,
                                    current_length)

            pad_length = config.max_caption_length - current_length - 2
            current_word_idxs = [config._START_
                                 ] + current_word_idxs[:current_num_words] + [
                                     config._END_
                                 ] + [config._PAD_] * pad_length

            word_idxs.append(current_word_idxs)
            sent_lens.append(current_num_words + 2)
        word_idxs = np.array(word_idxs)
        data = {'word_idxs': word_idxs, 'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file).item()
        word_idxs = data['word_idxs']
        sent_lens = data['sentence_len']

    if oracle_file is not None:
        with open(oracle_file, 'w') as outfile:
            # Write one space-separated line of word indices per caption.
            for line in word_idxs:
                outfile.write(' '.join(str(word) for word in line) + '\n')

    return vocabulary
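The wrap-and-pad step above relies on config._START_, config._END_, config._PAD_ and Vocabulary.process_sentence, none of which are shown; a minimal sketch of the same padding logic with plain integers and a toy word-to-id map standing in for them.

# Assumed stand-ins for config._START_, config._END_, config._PAD_.
START, END, PAD = 1, 2, 0
word_to_id = {"a": 3, "small": 4, "example": 5}


def wrap_and_pad(sentence, max_caption_length=6):
    """Wrap a sentence with START/END and pad it to a fixed length."""
    ids = [word_to_id[w] for w in sentence.split()]
    num_words = min(max_caption_length - 2, len(ids))  # leave room for START/END
    pad_length = max_caption_length - num_words - 2
    return [START] + ids[:num_words] + [END] + [PAD] * pad_length


print(wrap_and_pad("a small example"))  # [1, 3, 4, 5, 2, 0]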
Example #19
def main():
    word2cnt = Vocabulary()
    args = get_args()
    i = -1
    with open(args.input_files.strip(), 'r') as file_list:
        #  for file_path in fileinput.input():
        for file_path in file_list:
            i += 1
            if i % 100 == 0:
                print(i)
                if i % 1000 == 0:
                    word2cnt.save(args.output_prefix.strip() + "_partial")
            with open(file_path.rstrip(), 'r') as f:
                for line in f:
                    for word in line.strip().split():
                        word2cnt.observe_word(word)
    word2cnt.save(args.output_prefix.strip())
Example #20
def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    vqa = VQA(config.eval_answer_file, config.eval_question_file)
    vqa.filter_by_ques_len(config.max_question_length)
    vqa.filter_by_ans_len(1)

    print("Reading the questions...")
    annotations = process_vqa(vqa, 'COCO_val2014', config.eval_image_dir,
                              config.temp_eval_annotation_file)

    image_files = annotations['image_file'].values
    questions = annotations['question'].values
    question_ids = annotations['question_id'].values
    print("Questions read.")
    print("Number of questions = %d" % (len(question_ids)))

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    config.vocabulary_size = vocabulary.size

    print("Processing the questions...")
    if not os.path.exists(config.temp_eval_data_file):
        question_word_idxs, question_lens = process_questions(
            questions, vocabulary, config)
        data = {
            'question_word_idxs': question_word_idxs,
            'question_lens': question_lens
        }
        np.save(config.temp_eval_data_file, data)
    else:
        data = np.load(config.temp_eval_data_file).item()
        question_word_idxs = data['question_word_idxs']
        question_lens = data['question_lens']
    print("Questions processed.")

    print("Building the dataset...")
    dataset = DataSet(image_files, question_word_idxs, question_lens,
                      question_ids, config.batch_size)
    print("Dataset built.")
    return vqa, dataset, vocabulary, config
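Several examples above cache intermediate results with np.save and read them back with np.load(...).item(). np.save pickles a Python dict into a 0-d object array, and newer NumPy releases require allow_pickle=True when loading it back; a small stand-alone sketch of that round trip (the file name is arbitrary).

import numpy as np

data = {'question_word_idxs': np.zeros((2, 5), dtype=np.int64),
        'question_lens': [3, 5]}
np.save('temp_eval_data.npy', data)          # dict is stored as a 0-d object array
restored = np.load('temp_eval_data.npy', allow_pickle=True).item()
print(restored['question_lens'])             # [3, 5]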
Example #21
    def __init__(self, args):
        self.args = args

        self.q1s, self.q2s, self.labels = PreprocessData().build_corpus(
            self.args.trainpath)
        self.q1s_dev, self.q2s_dev, self.labels_dev = \
            PreprocessData().build_corpus(self.args.devpath)
        self.vocab = Vocabulary(self.q1s, self.q2s).get_vocab()
        self.vect = Vectorizer(self.vocab, self.args.pad)
        self.emb_weights, self.num_dimensions = Embedder(
            self.vocab, self.args.pretrained_path, self.args.use_w2v,
            self.args.num_dimensions).embedder()
        self.lstm = LSTMModel(self.args.dropout, self.args.use_bi,
                              self.args.hidden_dim, self.args.pad,
                              len(self.vocab), self.num_dimensions,
                              self.emb_weights, self.args.trainable,
                              self.args.use_pool, self.args.first_dense_dim,
                              self.args.lr).get_lstm_model()
Example #22
def build_vocabulary(config):
    """ Build the vocabulary from the training data and save it to a file. """
    vqa = VQA(config.train_answer_file, config.train_question_file)
    vqa.filter_by_ques_len(config.max_question_length)
    vqa.filter_by_ans_len(1)

    question_ids = list(vqa.qa.keys())
    questions = [vqa.qqa[k]['question'] for k in question_ids]
    answers = [vqa.qa[k]['best_answer'] for k in question_ids]

    vocabulary = Vocabulary()
    for question in tqdm(questions):
        vocabulary.add_words(word_tokenize(question))
    for answer in tqdm(answers):
        vocabulary.add_words(word_tokenize(answer))
    vocabulary.compute_frequency()
    vocabulary.save(config.vocabulary_file)
    return vocabulary
Example #23
    def __init__(self,
                 root,
                 split,
                 vocabulary='./utils/vocabulary.txt',
                 transform=None):
        self.root = root
        self.split = split

        with open(os.path.join(self.root, 'talk2car_w_rpn_no_duplicates.json'),
                  'rb') as f:
            data = json.load(f)[self.split]
            self.data = {int(k): v for k, v in data.items()}  # Map to int
        self.img_dir = os.path.join(self.root, 'images')
        self.transform = transform
        self.vocabulary = Vocabulary(vocabulary)

        if self.split in ['val', 'train']:
            self.add_train_annos = True  # Add extra info when reading out items for training
        else:
            self.add_train_annos = False

        self.ignore_index = 255  # Ignore index when all RPNs < 0.5 IoU
        self.num_rpns_per_image = 8  # We only use the top 8 RPNs per image
        # self.text_encoder = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

        # Filter out RPNs we are not going to use.
        # RPNs were obtained from CenterNet after soft NMS.
        # We order the scores and take the top k.
        assert (self.num_rpns_per_image < 65)
        rpns = {k: sample['centernet'] for k, sample in self.data.items()}
        rpns_score_ordered_idx = {
            k: np.argsort([rpn['score'] for rpn in v])
            for k, v in rpns.items()
        }
        rpns = {
            k: [
                v[idx]
                for idx in rpns_score_ordered_idx[k][-self.num_rpns_per_image:]
            ]
            for k, v in rpns.items()
        }
        for k in self.data.keys():
            self.data[k]['centernet'] = rpns[k]
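A stand-alone sketch of the top-k selection used above: np.argsort sorts scores in ascending order, so taking the tail of the index array keeps the k highest-scoring proposals (top_k_by_score and the toy proposals are illustrative names, not part of the dataset class).

import numpy as np


def top_k_by_score(proposals, k):
    """Return the k proposals with the highest 'score', lowest first."""
    order = np.argsort([p['score'] for p in proposals])
    return [proposals[idx] for idx in order[-k:]]


rpns = [{'score': 0.2}, {'score': 0.9}, {'score': 0.5}, {'score': 0.7}]
print(top_k_by_score(rpns, 2))  # [{'score': 0.7}, {'score': 0.9}]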
Example #24
def create_train_model(hparams, model_creator, scope=None):
    """Create train graph, model, and iterator."""
    print("# Creating TrainModel...")

    src_train_file = "%s/%s.%s" % (hparams.data_dir, hparams.train_prefix,
                                   hparams.src_suffix)
    tgt_train_file = "%s/%s.%s" % (hparams.data_dir, hparams.train_prefix,
                                   hparams.tgt_suffix)
    src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix,
                                   hparams.src_suffix)
    tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix,
                                   hparams.tgt_suffix)
    batch_size = hparams.batch_size
    num_buckets = hparams.num_buckets

    graph = tf.Graph()

    with graph.as_default(), tf.container(scope or "train"):
        skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64)

        vocabulary = Vocabulary(src_vocab_file=src_vocab_file,
                                tgt_vocab_file=tgt_vocab_file)

        iterator = TrainIterator(vocabulary=vocabulary,
                                 src_data_file=src_train_file,
                                 tgt_data_file=tgt_train_file,
                                 batch_size=batch_size,
                                 num_buckets=num_buckets,
                                 skip_count=skip_count_placeholder)

        assert isinstance(hparams, tf_training.HParams)

        model_params = get_model_params(hparams=hparams,
                                        vocabulary=vocabulary,
                                        iterator=iterator)
        model_params.add_hparam('mode', ModeKeys.TRAIN)

        model = model_creator(**model_params.values())

    return TrainModel(graph=graph,
                      model=model,
                      iterator=iterator,
                      skip_count_placeholder=skip_count_placeholder)
Example #25
def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file, config.max_eval_ann_num)
    image_ids = []
    image_files = []
    if not config.max_eval_ann_num:
        print('No config.max_eval_ann_num')
        image_ids = list(coco.imgs.keys())
        image_files = [
            os.path.join(config.eval_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
    else:
        print('config.max_eval_ann_num=', config.max_eval_ann_num)
        image_ids = [
            coco.anns[ann_id]['image_id']
            for ann_id in islice(coco.anns, 0, config.max_eval_ann_num)
        ]
        image_files = [
            os.path.join(config.eval_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in islice(image_ids, 0, config.max_eval_ann_num)
        ]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print('Downloading images...')
    coco.download(config.eval_image_dir, image_ids)
    print('Finished downloading images')

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary
Example #26
def prepare_eval_new_data(caption_file, image_dir, config):
    """ Prepare the data for evaluating the model with new dataset. """
    coco = COCO(caption_file)
    image_ids = list(coco.imgs.keys())
    image_files = [
        os.path.join(image_dir, coco.imgs[image_id]['file_name'])
        for image_id in image_ids
    ]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary
Example #27
def create_eval_model(hparams, model_creator, scope=None):
    """Create eval graph, model, src/tgt file holders, and iterator."""
    print("# Creating EvalModel...")

    src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix,
                                   hparams.src_suffix)
    tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix,
                                   hparams.tgt_suffix)
    batch_size = hparams.batch_size
    num_buckets = hparams.num_buckets

    graph = tf.Graph()

    with graph.as_default(), tf.container(scope or "eval"):
        src_eval_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
        tgt_eval_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)

        vocabulary = Vocabulary(src_vocab_file=src_vocab_file,
                                tgt_vocab_file=tgt_vocab_file)

        iterator = EvalIterator(vocabulary=vocabulary,
                                src_data_file=src_eval_file_placeholder,
                                tgt_data_file=tgt_eval_file_placeholder,
                                batch_size=batch_size,
                                num_buckets=num_buckets)

        assert isinstance(hparams, tf_training.HParams)

        model_params = get_model_params(hparams=hparams,
                                        vocabulary=vocabulary,
                                        iterator=iterator)
        model_params.add_hparam('mode', ModeKeys.EVAL)

        model = model_creator(**model_params.values())

    return EvalModel(graph=graph,
                     model=model,
                     src_file_placeholder=src_eval_file_placeholder,
                     tgt_file_placeholder=tgt_eval_file_placeholder,
                     iterator=iterator)
Example #28
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    if not os.path.exists(config.prepare_annotation_dir):
        os.mkdir(config.prepare_annotation_dir)
    coco = COCO(config, config.train_caption_file, config.val_caption_file)
    
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        coco.filter_by_cap_len(config.max_caption_length)
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
        vocabulary.save_counts(config.word_count_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" %(vocabulary.size))

    
    print("Processing the captions...")
    if not os.path.exists(config.train_csv_file):
        coco.filter_by_words(set(vocabulary.words))
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(
                config.dataset_image_dir,
                'train' if coco.imgs[image_id]['file_name'].find('train2014') >= 0 else 'val',
                coco.imgs[image_id]['file_name'])
            for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.train_csv_file)
    else:
        annotations = pd.read_csv(config.train_csv_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values
Example #29
def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file, config.ignore_file_eval)
    image_ids = list(coco.imgs.keys())
    image_files = [os.path.join(config.eval_image_dir,
                                coco.imgs[image_id]['file_name'])
                                for image_id in image_ids]
    print("IMAGE FILES SHAPE PREP DATA " + str(len(image_files)))
    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size,
                                config.ctrl_symbols,
                                config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" %(vocabulary.size))

    print("Building the dataset...")
    dataset = DataSet(coco, vocabulary, image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return dataset
Example #30
def create_infer_model(hparams, model_creator, scope=None):
    """Create inference model."""
    print("# Creating InferModel...")

    src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix,
                                   hparams.src_suffix)
    tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix,
                                   hparams.tgt_suffix)

    graph = tf.Graph()

    with graph.as_default(), tf.container(scope or "infer"):
        src_data_placeholder = tf.placeholder(shape=[None], dtype=tf.string)
        batch_size_placeholder = tf.placeholder(shape=[], dtype=tf.int64)

        vocabulary = Vocabulary(src_vocab_file=src_vocab_file,
                                tgt_vocab_file=tgt_vocab_file)

        iterator = InferIterator(vocabulary=vocabulary,
                                 src_data=src_data_placeholder,
                                 batch_size=batch_size_placeholder)

        assert isinstance(hparams, tf_training.HParams)

        model_params = get_model_params(hparams=hparams,
                                        vocabulary=vocabulary,
                                        iterator=iterator)
        model_params.add_hparam('mode', ModeKeys.INFER)

        model = model_creator(**model_params.values())

    return InferModel(graph=graph,
                      model=model,
                      src_data_placeholder=src_data_placeholder,
                      batch_size_placeholder=batch_size_placeholder,
                      iterator=iterator)