def build_vocab(cls, json, tokenized_captions, threshold):
    print("Building vocabulary")
    coco = COCO(json)
    counter = Counter()
    ids = coco.anns.keys()
    for i, id in enumerate(ids):
        # Previously: caption = str(coco.anns[id]['caption'])
        #             tokens = CocoDataset.tokenize(caption)
        tokens = tokenized_captions[id]
        counter.update(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper.
    vocab = Vocabulary()

    # Add the words to the vocabulary.
    for word in words:
        vocab.add_word(word)
    print("Total vocabulary size: %d" % len(vocab))
    return vocab
def load_embeddings(self, src_embeddings, tgt_embeddings, vocabulary: Vocabulary):
    aligned_embeddings = torch.div(torch.randn(vocabulary.size(), 300), 10)
    found_count = 0
    for i in range(len(vocabulary.index2word)):
        word = vocabulary.get_word(i)
        language = vocabulary.get_language(i)
        if language == "src" and word in src_embeddings.wv:
            aligned_embeddings[i] = torch.FloatTensor(src_embeddings.wv[word])
            found_count += 1
        elif language == "src" and word.lower() in src_embeddings.wv:
            aligned_embeddings[i] = torch.FloatTensor(src_embeddings.wv[word.lower()])
            found_count += 1
        if language == "tgt" and word in tgt_embeddings.wv:
            aligned_embeddings[i] = torch.FloatTensor(tgt_embeddings.wv[word])
            found_count += 1
        elif language == "tgt" and word.lower() in tgt_embeddings.wv:
            aligned_embeddings[i] = torch.FloatTensor(tgt_embeddings.wv[word.lower()])
            found_count += 1
    logger.info("Embeddings filled: " + str(found_count) + " of " + str(vocabulary.size()))
    enable_training = self.encoder.embedding.weight.requires_grad
    self.encoder.embedding.weight = nn.Parameter(aligned_embeddings, requires_grad=enable_training)
    self.decoder.embedding.weight = nn.Parameter(aligned_embeddings, requires_grad=enable_training)
def setup(self, stage: Optional[str] = None):
    if not path.exists(path.join(self._dataset_dir, Vocabulary.vocab_file)):
        Vocabulary.build_from_scratch(
            path.join(self._dataset_dir, f"{self._config.dataset}.{self._train}.jsonl"))
    self._vocabulary = Vocabulary(self._config)
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")
    annotations = pd.read_csv(config.temp_annotation_file)
    captions = annotations['caption'].values
    image_ids = annotations['image_id'].values
    image_files = annotations['image_file'].values
    data = np.load(config.temp_data_file).item()
    word_idxs = data['word_idxs']
    masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs, masks, True, True)
    print("Dataset built.")
    return dataset
def main2():
    vocabs = [Vocabulary.load("./vocabs/v{0}".format(i)) for i in range(8)]
    print("loaded vocabs!")
    master_vocab = Vocabulary.merge_vocabularies(vocabs)
    master_vocab.save("./vocabs/final_vocab")
    import pdb
    pdb.set_trace()
def _counters_to_vocab(config: Dict, counters: Dict[str, TypeCounter[str]]) -> Vocabulary:
    additional_tokens = [SOS, EOS, PAD, UNK] if config["token"]["is_wrapped"] else [PAD, UNK]
    token_to_id = _counter_to_dict(counters["token"], config["token"]["vocabulary_size"], additional_tokens)
    additional_targets = [SOS, EOS, PAD, UNK] if config["target"]["is_wrapped"] else [PAD, UNK]
    label_to_id = _counter_to_dict(counters["target"], config["target"]["vocabulary_size"], additional_targets)
    additional_nodes = [SOS, EOS, PAD, UNK] if config["path"]["is_wrapped"] else [PAD, UNK]
    node_to_id = _counter_to_dict(counters["path"], config["path"]["vocabulary_size"], additional_nodes)
    vocabulary = Vocabulary(token_to_id, node_to_id, label_to_id)
    if "type" in counters:
        additional_types = [SOS, EOS, PAD, UNK] if config["type"]["is_wrapped"] else [PAD, UNK]
        vocabulary.type_to_id = _counter_to_dict(counters["type"], config["type"]["vocabulary_size"], additional_types)
    return vocabulary
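# The snippet above relies on a helper `_counter_to_dict` that is not shown here.
# A minimal sketch of what such a helper could look like (the real signature and
# behavior in the original project may differ): special tokens get the lowest ids,
# then the most frequent symbols up to the configured vocabulary size.
from collections import Counter
from typing import Dict, List, Optional


def _counter_to_dict(counter: Counter, max_size: Optional[int] = None,
                     additional_tokens: Optional[List[str]] = None) -> Dict[str, int]:
    token_to_id: Dict[str, int] = {}
    # Reserve ids for special tokens first (SOS/EOS/PAD/UNK in the caller).
    for token in additional_tokens or []:
        token_to_id[token] = len(token_to_id)
    # Then add the most frequent symbols; most_common(None) returns everything.
    for token, _count in counter.most_common(max_size):
        if token not in token_to_id:
            token_to_id[token] = len(token_to_id)
    return token_to_id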
def init_mapping(bi_dict_filename: str, vocabulary: Vocabulary, first_lang, second_lang):
    mapping = defaultdict(set)
    with open(bi_dict_filename, "r", encoding='utf-8') as r:
        for line in r:
            first_word, second_word = line.strip().split()

            first_index = vocabulary.get_unk(first_lang)
            if vocabulary.has_word(first_word, first_lang):
                first_index = vocabulary.get_index(first_word, first_lang)
            elif vocabulary.has_word(first_word.capitalize(), first_lang):
                first_index = vocabulary.get_index(first_word.capitalize(), first_lang)

            second_index = vocabulary.get_unk(second_lang)
            if vocabulary.has_word(second_word, second_lang):
                second_index = vocabulary.get_index(second_word, second_lang)
            elif vocabulary.has_word(second_word.capitalize(), second_lang):
                second_index = vocabulary.get_index(second_word.capitalize(), second_lang)

            mapping[first_index].add(second_index)
    return mapping
def build_vocab(json, threshold):
    """Build a simple vocabulary wrapper."""
    coco = COCO(json)
    counter = Counter()
    ids = coco.anns.keys()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        if i % 1000 == 0:
            print("[%d/%d] Tokenized the captions." % (i, len(ids)))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
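# A minimal sketch of the Vocabulary wrapper the COCO build_vocab snippets above assume
# (only add_word / __call__ / __len__ are used above; the attribute names here are assumptions).
class Vocabulary(object):
    """Simple word <-> index wrapper."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        # Register each word once; ids are assigned in insertion order.
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        # Unknown words fall back to the '<unk>' token added in build_vocab.
        if word not in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)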
def build_corpus(self):
    print(f'Loading training trees from `{self.train_path}`...')
    if self.multitask == 'ccg':
        train_treebank = ccg.fromfile(self.train_path)
    else:
        with open(self.train_path) as f:
            train_treebank = [fromstring(line.strip()) for line in f]

    print(f'Loading development trees from `{self.dev_path}`...')
    with open(self.dev_path) as f:
        dev_treebank = [fromstring(line.strip()) for line in f]

    print(f'Loading test trees from `{self.test_path}`...')
    with open(self.test_path) as f:
        test_treebank = [fromstring(line.strip()) for line in f]

    if self.multitask == 'spans':
        # Need trees with span information.
        train_treebank = [tree.convert() for tree in train_treebank]
        dev_treebank = [tree.convert() for tree in dev_treebank]
        test_treebank = [tree.convert() for tree in test_treebank]

    print("Constructing vocabularies...")
    if self.vocab_path is not None:
        print(f'Using word vocabulary specified in `{self.vocab_path}`')
        with open(self.vocab_path) as f:
            vocab = json.load(f)
        words = [word for word, count in vocab.items() for _ in range(count)]
    else:
        words = [word for tree in train_treebank for word in tree.words()]

    if self.multitask == 'none':
        labels = []
    else:
        labels = [label for tree in train_treebank for label in tree.labels()]

    if self.multitask == 'none':
        words = [UNK, START] + words
    else:
        words = [UNK, START, STOP] + words

    word_vocab = Vocabulary.fromlist(words, unk_value=UNK)
    label_vocab = Vocabulary.fromlist(labels)

    self.word_vocab = word_vocab
    self.label_vocab = label_vocab
    self.train_treebank = train_treebank
    self.dev_treebank = dev_treebank
    self.test_treebank = test_treebank

    print('\n'.join((
        'Corpus statistics:',
        f'Vocab: {word_vocab.size:,} words, {label_vocab.size:,} nonterminals',
        f'Train: {len(train_treebank):,} sentences',
        f'Dev: {len(dev_treebank):,} sentences',
        f'Test: {len(test_treebank):,} sentences')))
def get_vocabulary(cls, vocab_path, captions_path, tokenized_captions, threshold=1):
    # Load or construct the vocabulary.
    if os.path.exists(vocab_path):
        vocab = Vocabulary.load(vocab_path)
    else:
        vocab = cls.build_vocab(captions_path, tokenized_captions, threshold)
        # TODO: check if saving is safe
        Vocabulary.save(vocab, vocab_path)
        print("Saved the vocabulary to '%s'" % vocab_path)
    return vocab
def from_serializable(cls, contents):
    """Instantiate a ReviewVectorizer from a serializable dictionary

    Args:
        contents (dict): the serializable dictionary
    Returns:
        an instance of the ReviewVectorizer class
    """
    review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
    rating_vocab = Vocabulary.from_serializable(contents['rating_vocab'])
    return cls(review_vocab=review_vocab, rating_vocab=rating_vocab)
def from_dataframe(cls, review_df, cutoff=25):
    """Instantiate the vectorizer from the dataset dataframe

    Args:
        review_df (pandas.DataFrame): the review dataset
        cutoff (int): the parameter for frequency-based filtering
    Returns:
        an instance of the ReviewVectorizer
    """
    review_vocab = Vocabulary(add_unk=True)
    rating_vocab = Vocabulary(add_unk=False)

    # Add ratings
    for rating in sorted(set(review_df.rating)):
        rating_vocab.add_token(rating)

    # Add top words if count > provided count
    word_counts = Counter()
    for review in review_df.review:
        for word in review.split(" "):
            if word not in string.punctuation:
                word_counts[word] += 1

    for word, count in word_counts.items():
        if count > cutoff:
            review_vocab.add_token(word)

    return cls(review_vocab, rating_vocab)
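# Hedged usage sketch for the two ReviewVectorizer constructors above. The CSV path
# and the to_serializable round-trip are assumptions for illustration, not part of
# the snippets themselves.
import json
import pandas as pd

review_df = pd.read_csv("reviews_with_splits.csv")  # assumed input file
vectorizer = ReviewVectorizer.from_dataframe(review_df, cutoff=25)

# If the class also exposes a to_serializable() counterpart (not shown above),
# the vectorizer could be persisted and restored like this:
# with open("vectorizer.json", "w") as fp:
#     json.dump(vectorizer.to_serializable(), fp)
# with open("vectorizer.json") as fp:
#     vectorizer = ReviewVectorizer.from_serializable(json.load(fp))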
def __init__(self, args):
    self.args = args
    train = DataLoader(self.args.trainpath)
    dev = DataLoader(self.args.devpath)
    self.train_words, self.train_poss, self.train_chunks, self.train_labels = train.get_all_train_tokens()
    self.train_max_sentence_len, self.train_max_word_len = train.get_required_max_len()
    self.dev_words, self.dev_poss, self.dev_chunks, self.dev_labels = dev.get_all_train_tokens()
    self.dev_max_sentence_len, self.dev_max_word_len = dev.get_required_max_len()

    vocabulary = Vocabulary(self.train_words)
    self.vocab = vocabulary.get_word_vocab()
    self.char_vocab = vocabulary.get_char_vocab()

    self.train_vect = Vectorizer(self.train_max_sentence_len, self.train_max_word_len,
                                 self.vocab, self.char_vocab, self.train_words)
    self.dev_vect = Vectorizer(self.train_max_sentence_len, self.train_max_word_len,
                               self.vocab, self.char_vocab, self.dev_words)
    self.poss_vect = LabelEncoderModel(self.train_poss, self.train_max_sentence_len)
    self.chunks_vect = LabelEncoderModel(self.train_chunks, self.train_max_sentence_len)
    self.labels_vect = LabelEncoderModel(self.train_labels, self.train_max_sentence_len)  # st wrong here
    self.pos_emb_weights = self.poss_vect.get_emb_weights()
    self.chunk_emb_weights = self.chunks_vect.get_emb_weights()
    self.word_emb_weights, self.word_emb_dimensions = PretrainedEmbedder(
        self.vocab, self.args.pretrained_path).pretrained_embedder()

    self.model = ModelTraining(
        self.args.dropout, self.args.lr,
        len(set(sum(self.train_labels, []))),
        len(self.vocab), len(self.char_vocab),
        self.train_max_word_len,
        len(set(sum(self.train_poss, []))),
        len(set(sum(self.train_chunks, []))),
        word_emb_dimensions=self.word_emb_dimensions,
        word_emb_weights=self.word_emb_weights,
        pos_emb_weights=self.pos_emb_weights,
        chunk_emb_weights=self.chunk_emb_weights).model_build()
def train(self, epoch):
    trace('making vocabularies ...')
    self.trg_vocab = Vocabulary.new(gens.word_list(self.target), self.vocab)

    trace('making model ...')
    trace('epoch %d/%d: ' % (epoch + 1, self.epoch))
    opt = optimizers.AdaGrad(lr=0.01)
    opt.setup(self.encdec)
    opt.add_hook(optimizer.GradientClipping(5))
    gen1 = gens.word_list(self.target)
    gen = gens.batch(gen1, self.minibatch)

    for trg_batch in gen:
        self.batch_size = len(trg_batch)
        self.trg_batch = fill_batch(trg_batch)
        if len(trg_batch) != self.minibatch:
            break
        self.encdec.clear(self.batch_size)
        self.__forward_img()
        self.encdec.reset(self.batch_size)
        loss, hyp_batch = self.__forward_word(self.trg_batch, self.encdec, True, 0)
        loss.backward()
        opt.update()
        K = len(self.trg_batch) - 2
        self.print_out(K, hyp_batch, epoch)
def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file)
    if config.is_person_model == 'Y':
        file_data = pd.read_csv(config.person_eval_caption_file)
        image_ids = file_data['image_id'].values
        image_files = file_data['image_file'].values
    else:
        image_ids = list(coco.imgs.keys())
        image_files = [os.path.join(config.eval_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary
def test_forward(self):
    with initialize_config_dir(config_dir=get_config_directory()):
        data_folder, dataset_name = get_test_data_info()
        config = compose("main", overrides=[
            f"data_folder={data_folder}",
            f"dataset.name={dataset_name}"
        ])
        dataset_folder = join(config.data_folder, config.dataset.name)
        vocabulary = Vocabulary.load_vocabulary(join(dataset_folder, config.vocabulary_name))
        data_file_path = join(dataset_folder, f"{config.dataset.name}.{config.train_holdout}.c2s")
        dataset = PathContextDataset(data_file_path, config, vocabulary, False)
        batch = PathContextBatch([dataset[i] for i in range(config.hyper_parameters.batch_size)])

        model = PathEncoder(
            config.encoder,
            config.decoder.decoder_size,
            len(vocabulary.token_to_id),
            vocabulary.token_to_id[PAD],
            len(vocabulary.node_to_id),
            vocabulary.node_to_id[PAD],
        )
        output = model(batch.contexts)
        true_shape = (sum(batch.contexts_per_label), config.decoder.decoder_size)
        self.assertTupleEqual(true_shape, output.shape)
def main():
    args = get_args()
    vocab = Vocabulary.load(args.vocab_prefix.strip())
    output_dir = path.realpath(args.output_dir.strip())
    if args.soseos:
        line2arr = partial(line2arr_with_soseos, vocab)
        print("sos-eos!!!")
    else:
        line2arr = partial(line2arr_no_soseos, vocab)

    counter = -1
    for line in sys.stdin.readlines():
        counter += 1
        if counter % 100 == 0:
            print(counter)
            sys.stdout.flush()
        fname = line.strip()
        lines = open(fname, 'r').readlines()
        stripped = map(lambda x: x.strip(), lines)
        non_empty = filter(lambda x: x != "", stripped)
        file_arr = [line2arr(line) for line in non_empty]
        np_arr = np.array(file_arr)
        try:
            new_fname = path.join(output_dir, path.split(fname)[1].replace(".tok", ".npy"))
            np.save(new_fname, np_arr)
        except:
            print("errored out on: {0}".format(fname))
def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file)
    image_ids = list(coco.imgs.keys())
    image_files = [
        os.path.join(config.eval_image_dir, coco.imgs[image_id]['file_name'])
        for image_id in image_ids
    ]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    if config.eval_data_count_limit > 0:
        print("-----------------------------------------------")
        print("Restricting Sz:\t", config.eval_data_count_limit)
        print("Batch Sz:\t", config.batch_size)
        image_ids = image_ids[0:config.eval_data_count_limit]
        image_files = image_files[0:config.eval_data_count_limit]

        # Dump the image paths to a file.
        filepath = 'eval_images.csv'
        with open(filepath, 'w') as file_handler:
            for i in range(0, config.eval_data_count_limit):
                file_handler.write("{}\n".format(image_files[i]))
        print("-----------------------------------------------")

    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary
def __init__(self, args, data_mode='train', single_pass=False, batch_size=None):
    self.args = copy.deepcopy(args)
    if batch_size is not None:
        self.args.batch_size = batch_size
    self.vocab = Vocabulary(self.args)
    self.data_mode = data_mode
    self.single_pass = single_pass
def get_huffman_tree(params):
    if "huff_tree_loc" in params:
        with open(params["huff_tree_loc"], 'rb') as f:
            huff_tree = pickle.load(f)
    else:
        vocab_size = params["n_vocab"]
        soseos_counts_estim = [40114695 for i in range(2)]
        vocab = Vocabulary.load("/hdd/data/nlp/raw/unzipped/ff15_book/vocabs/final_vocab")
        sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)
        sorted_counts = [x[1] for x in sorted_vocab]
        cutoff_counts = sorted_counts[0:vocab_size]
        oov_counts = [sum(sorted_counts[vocab_size:])]
        oov_percent = (100.0 * oov_counts[0]) / sum(cutoff_counts)
        # print("oov % = {0:.5f}".format(oov_percent))
        all_counts = soseos_counts_estim + cutoff_counts + oov_counts
        params["vocab_counts"] = all_counts
        as_hash = {i: v for (i, v) in enumerate(all_counts)}
        huff_tree = chainer.links.BinaryHierarchicalSoftmax.create_huffman_tree(as_hash)
    print("loaded huffman tree")
    return huff_tree
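# Hedged usage sketch: the tree returned above is normally passed straight into
# Chainer's hierarchical-softmax output layer. The hidden size of 200 and the
# params dict contents are assumptions for illustration.
import chainer

params = {"n_vocab": 50000}
tree = get_huffman_tree(params)
hs_link = chainer.links.BinaryHierarchicalSoftmax(200, tree)
# loss = hs_link(hidden_states, target_word_ids)  # used as the loss layer during training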
def prepare_test_data(config, image_path=None):
    """ Prepare the data for testing the model. """
    if image_path is None:
        files = os.listdir(config.test_image_dir)
        image_files = [
            os.path.join(config.test_image_dir, f) for f in files
            if f.lower().endswith('.jpg') or f.lower().endswith('.jpeg')
        ]
    else:
        image_files = [image_path]
    image_ids = list(range(len(image_files)))

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return dataset, vocabulary
def dataset():
    vocab = Vocabulary(args)
    dataset = Dataset(args, vocab)
    source_files = sorted(glob.glob(args.dataset_file_path + 'train_source*.dat'))
    target_files = sorted(glob.glob(args.dataset_file_path + 'train_target*.dat'))

    print('========== Begin something about vocabulary:')
    print('Vocab Size:', dataset.vocab.vocab_size)
    print('First 10 Word2cnt:', list(dataset.vocab._word2cnt.items())[:10])
    print()

    print('========== Begin something about dataset:')
    X_lens = [len(sen.split()) for source_file in source_files for sen in open(source_file)]
    y_lens = [len(sen.split()) for target_file in target_files for sen in open(target_file)]
    print('Number of Source Sentences:', len(X_lens))
    print('Number of Target Sentences:', len(y_lens))
    print()
    print('Mean Length of Source Sentences:', np.mean(X_lens))
    print('Max Length of Source Sentences:', np.max(X_lens))
    print('Min Length of Source Sentences:', np.min(X_lens))
    print()
    print('Mean Length of Target Sentences:', np.mean(y_lens))
    print('Max Length of Target Sentences:', np.max(y_lens))
    print('Min Length of Target Sentences:', np.min(y_lens))
    print()
def main3():
    master_vocab = Vocabulary.load("./vocabs/final_vocab")
    offsets = master_vocab.get_offsets()
    with open("./vocabs/offsets.pkl", 'wb') as out_f:
        pickle.dump(offsets, out_f)
    import pdb
    pdb.set_trace()
    print(32)
def build_vocabulary(config):
    """ Build the vocabulary from the training data and save it to a file. """
    vqa = VQA(config.train_answer_file, config.train_question_file)
    vqa.filter_by_ques_len(config.max_question_length)
    vqa.filter_by_ans_len(1)
    question_ids = list(vqa.qa.keys())
    questions = [vqa.qqa[k]['question'] for k in question_ids]
    answers = [vqa.qa[k]['best_answer'] for k in question_ids]

    vocabulary = Vocabulary()
    for question in tqdm(questions):
        vocabulary.add_words(word_tokenize(question))
    for answer in tqdm(answers):
        vocabulary.add_words(word_tokenize(answer))
    vocabulary.compute_frequency()
    vocabulary.save(config.vocabulary_file)
    return vocabulary
def train(config: DictConfig):
    filter_warnings()
    print_config(config)
    seed_everything(config.seed)

    known_models = {"code2seq": get_code2seq, "code2class": get_code2class, "typed-code2seq": get_typed_code2seq}
    if config.name not in known_models:
        print(f"Unknown model: {config.name}, try one of {known_models.keys()}")
        return

    vocabulary = Vocabulary.load_vocabulary(join(config.data_folder, config.dataset.name, config.vocabulary_name))
    model, data_module = known_models[config.name](config, vocabulary)

    # define logger
    wandb_logger = WandbLogger(
        project=f"{config.name}-{config.dataset.name}", log_model=True, offline=config.log_offline
    )
    wandb_logger.watch(model)
    # define model checkpoint callback
    checkpoint_callback = ModelCheckpoint(
        dirpath=wandb_logger.experiment.dir,
        filename="{epoch:02d}-{val_loss:.4f}",
        period=config.save_every_epoch,
        save_top_k=-1,
    )
    upload_checkpoint_callback = UploadCheckpointCallback(wandb_logger.experiment.dir)
    # define early stopping callback
    early_stopping_callback = EarlyStopping(
        patience=config.hyper_parameters.patience, monitor="val_loss", verbose=True, mode="min"
    )
    # define callback for printing intermediate result
    print_epoch_result_callback = PrintEpochResultCallback("train", "val")
    # use gpu if it exists
    gpu = 1 if torch.cuda.is_available() else None
    # define learning rate logger
    lr_logger = LearningRateMonitor("step")

    trainer = Trainer(
        max_epochs=config.hyper_parameters.n_epochs,
        gradient_clip_val=config.hyper_parameters.clip_norm,
        deterministic=True,
        check_val_every_n_epoch=config.val_every_epoch,
        log_every_n_steps=config.log_every_epoch,
        logger=wandb_logger,
        gpus=gpu,
        progress_bar_refresh_rate=config.progress_bar_refresh_rate,
        callbacks=[
            lr_logger,
            early_stopping_callback,
            checkpoint_callback,
            upload_checkpoint_callback,
            print_epoch_result_callback,
        ],
        resume_from_checkpoint=config.resume_from_checkpoint,
    )

    trainer.fit(model=model, datamodule=data_module)
    trainer.test()
def __init__(self, root, split, vocabulary='./utils/vocabulary.txt', transform=None):
    self.root = root
    self.split = split

    with open(os.path.join(self.root, 'talk2car_w_rpn_no_duplicates.json'), 'rb') as f:
        data = json.load(f)[self.split]
        self.data = {int(k): v for k, v in data.items()}  # Map keys to int

    self.img_dir = os.path.join(self.root, 'images')
    self.transform = transform
    self.vocabulary = Vocabulary(vocabulary)

    if self.split in ['val', 'train']:
        self.add_train_annos = True  # Add extra info when reading out items for training
    else:
        self.add_train_annos = False

    self.ignore_index = 255  # Ignore index when all RPNs < 0.5 IoU
    self.num_rpns_per_image = 8  # Only the top num_rpns_per_image RPNs are used per image
    # self.text_encoder = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

    # Filter out RPNs we are not going to use.
    # RPNs were obtained from CenterNet after soft NMS.
    # We order the scores and take the top k.
    assert self.num_rpns_per_image < 65
    rpns = {k: sample['centernet'] for k, sample in self.data.items()}
    rpns_score_ordered_idx = {
        k: np.argsort([rpn['score'] for rpn in v])
        for k, v in rpns.items()
    }
    rpns = {
        k: [v[idx] for idx in rpns_score_ordered_idx[k][-self.num_rpns_per_image:]]
        for k, v in rpns.items()
    }
    for k in self.data.keys():
        self.data[k]['centernet'] = rpns[k]
def build_vocabulary(config, max_ann_num=None):
    """ Build the vocabulary from the training data and save it to a file. """
    coco = COCO(config.train_caption_file, config.max_train_ann_num)
    coco.filter_by_cap_len(config.max_caption_length)

    vocabulary = Vocabulary(config.vocabulary_size)
    if not config.max_train_ann_num:
        vocabulary.build(coco.all_captions())
    else:
        vocabulary.build((coco.all_captions())[:config.max_train_ann_num])
    vocabulary.save(config.vocabulary_file)
    return vocabulary
def build_vocabulary(config, captions, oracle_file):
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    if True:  # not os.path.exists(config.vocabulary_file):
        vocabulary.build(captions)
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Number of words = %d" % (vocabulary.size))
    print("NUM CAPTIONS: " + str(len(captions)))

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        sent_lens = []
        for caption in captions:
            current_word_idxs, current_length = vocabulary.process_sentence(caption)
            current_num_words = min(config.max_caption_length - 2, current_length)
            pad_length = config.max_caption_length - current_length - 2
            current_word_idxs = ([config._START_]
                                 + current_word_idxs[:current_num_words]
                                 + [config._END_]
                                 + [config._PAD_] * pad_length)
            word_idxs.append(current_word_idxs)
            sent_lens.append(current_num_words + 2)
        word_idxs = np.array(word_idxs)
        data = {'word_idxs': word_idxs, 'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file).item()
        word_idxs = data['word_idxs']
        sent_lens = data['sentence_len']

    if oracle_file is not None:
        with open(oracle_file, 'w') as outfile:
            paras = ""
            for line in word_idxs:
                for word in line:
                    paras += (str(word) + ' ')
                paras += '\n'
            outfile.write(paras)
    return vocabulary
def prepare_test_data(config):
    """ Prepare the data for testing the model. """
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    return vocabulary
def prepare_test_data(config):
    """ Prepare the data for testing the model. """
    image_files = [config.test_file_name]
    image_ids = list(range(len(image_files)))

    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)

    dataset = DataSet(image_ids, image_files, config.batch_size)
    return dataset, vocabulary
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    if not os.path.exists(config.prepare_annotation_dir):
        os.mkdir(config.prepare_annotation_dir)
    coco = COCO(config, config.train_caption_file, config.val_caption_file)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        coco.filter_by_cap_len(config.max_caption_length)
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
        vocabulary.save_counts(config.word_count_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")
    if not os.path.exists(config.train_csv_file):
        coco.filter_by_words(set(vocabulary.words))
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.dataset_image_dir,
                         'train' if coco.imgs[image_id]['file_name'].find('train2014') >= 0 else 'val',
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.train_csv_file)
    else:
        annotations = pd.read_csv(config.train_csv_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values
def build_vocab():
    vocab = Vocabulary(args)
    with open(processed_cnn_vocab_file, 'r') as vocab_f:
        for line in vocab_f:
            pieces = line.split()
            if len(pieces) != 2:
                print('Warning: incorrectly formatted line in vocabulary file: %s\n' % line)
                continue
            w = pieces[0]
            cnt = int(pieces[1])
            if w in [opt.SENTENCE_START, opt.SENTENCE_END, opt.UNKNOWN_TOKEN, opt.PAD_TOKEN, opt.BOS, opt.EOS]:
                raise Exception(
                    '<s>, </s>, [UNK], [PAD], [START] and [STOP] shouldn\'t be in the vocab file, but %s is' % w)
            if w in vocab._word2id:
                raise Exception('Duplicated word in vocabulary file: %s' % w)
            vocab._word2id[w] = vocab.vocab_size
            vocab._id2word[vocab.vocab_size] = w
            vocab.vocab_size += 1
            vocab._word2cnt[w] = cnt
            if vocab.args.max_vocab_size != 0 and vocab.vocab_size >= vocab.args.max_vocab_size:
                print("max_size of vocab was specified as %i; we now have %i words. Stopping reading."
                      % (vocab.args.max_vocab_size, vocab.vocab_size))
                vocab.save_vocab()
                break
    print("Finished constructing vocabulary of %i total words. Last word added: %s"
          % (vocab.vocab_size, vocab._id2word[vocab.vocab_size - 1]))
def __init__(self, use_gpu, gpu_id):
    self.parameter_dict = {}
    train_path = APP_ROOT + "/../../Chainer_Image_Caption_Neural_Network/Code/Data/"
    self.resize_image_path = APP_ROOT + "/../../Chainer_Image_Caption_Neural_Network/Code/"
    self.parameter_dict["id2image"] = train_path + "index2img_exclude.txt"
    self.parameter_dict["id2caption"] = train_path + "index2caption.txt"
    self.parameter_dict["target"] = train_path + "index2caption.txt"
    self.parameter_dict["vocab"] = 5000
    self.parameter_dict["embed"] = 300
    self.parameter_dict["hidden"] = 200
    self.parameter_dict["epoch"] = 20
    self.parameter_dict["minibatch"] = 110
    self.parameter_dict["generation_limit"] = 256
    self.parameter_dict["use_gpu"] = use_gpu
    self.parameter_dict["gpu_id"] = gpu_id
    self.parameter_dict["choose_model"] = "Alex_Model"

    if self.parameter_dict["choose_model"] == "Alex_Model":
        self.insize = 224
    if self.parameter_dict["choose_model"] == "AlexBn_Model":
        self.insize = 227

    mean_image = pickle.load(open("mean.npy", 'rb'))
    cropwidth = 256 - self.insize
    self.start = cropwidth // 2
    self.stop = self.start + self.insize
    self.mean_image = mean_image[:, self.start:self.stop, self.start:self.stop].copy()
    self.x_batch = np.ndarray((self.parameter_dict["minibatch"], 3, self.insize, self.insize), dtype=np.float32)
    self.y_batch = np.ndarray((self.parameter_dict["minibatch"]), dtype=np.int32)

    self.trg_vocab = Vocabulary.new(gens.word_list(self.parameter_dict["target"]),
                                    self.parameter_dict["vocab"])
    self.read_data = Read_Data(self.parameter_dict["id2image"],
                               "Data/val2014_resize",
                               self.parameter_dict["id2caption"])
    self.read_data.load_image_list()
    self.read_data.load_caption_list()
def test(self):
    trace('loading model ...')
    self.trg_vocab = Vocabulary.load("model/" + self.model + '.trgvocab')
    self.batch_size = len(self.trg_batch)
    encdec = EncoderDecoder.load_spec("model/" + self.model + '.spec')
    serializers.load_hdf5("model/" + self.model + '.weights', encdec)

    trace('generating translation ...')
    generated = 0

    with open(self.target, 'w') as fp:
        self.__forward_img()
        trace('sample %8d ...' % (generated + 1))
        hyp_batch = self.__forward_word(self.trg_batch, encdec, False, self.generation_limit)
        for hyp in hyp_batch:
            hyp.append('</s>')
            hyp = hyp[:hyp.index('</s>')]
            print('hyp : ' + ''.join(hyp))
            print(' '.join(hyp), file=fp)

    trace('finished.')
def test(self):
    trace('loading model ...')
    trg_vocab = Vocabulary.load(self.model + '.trgvocab')
    self.encdec = EncoderDecoderAttention.load_spec(self.model + '.spec')
    serializers.load_hdf5(self.model + '.weights', self.encdec)

    trace('generating translation ...')
    generated = 0
    trace('sample %8d - %8d ...' % (generated + 1, generated))
    hyp_batch = self.forward(trg_vocab, False, self.generation_limit)

    source_count = 0
    with open(self.target, 'w') as fp:
        for hyp in hyp_batch:
            hyp.append('</s>')
            hyp = hyp[:hyp.index('</s>')]
            print('hyp : ' + ''.join(hyp))
            fp.write(' '.join(hyp))
            source_count = source_count + 1

    trace('finished.')
def train(self):
    trace('making vocabularies ...')
    trg_vocab = Vocabulary.new(gens.word_list(self.target), self.vocab)

    trace('making model ...')

    for epoch in range(self.epoch):
        trace('epoch %d/%d: ' % (epoch + 1, self.epoch))
        trained = 0
        opt = optimizers.AdaGrad(lr=0.01)
        opt.setup(self.encdec)
        opt.add_hook(optimizer.GradientClipping(5))
        gen1 = gens.word_list(self.target)
        gen = gens.batch(gen1, self.minibatch)
        random_number = random.randint(0, self.minibatch - 1)
        for trg_batch in gen:
            self.trg_batch = fill_batch(trg_batch)
            if len(self.trg_batch) != self.minibatch:
                break
            hyp_batch, loss = self.forward(trg_vocab, self.use_gpu, self.gpu_id)
            loss.backward()
            opt.update()
            K = len(self.trg_batch)
            if trained == 0:
                self.print_out(random_number, epoch, trained, hyp_batch)
            trained += K

    trace('saving model ...')
    prefix = self.model
    trg_vocab.save(prefix + '.trgvocab')
    self.encdec.save_spec(prefix + '.spec')
    serializers.save_hdf5(prefix + '.weights', self.encdec)

    trace('finished.')
class Dataset(object):
    def __init__(self, args, data_mode='train', single_pass=False, batch_size=None):
        self.args = copy.deepcopy(args)
        if batch_size is not None:
            self.args.batch_size = batch_size
        self.vocab = Vocabulary(self.args)
        self.data_mode = data_mode
        self.single_pass = single_pass

    @property
    def examples(self):
        source_files = sorted(glob.glob(self.args.dataset_file_path + self.data_mode + '_source*.dat'))
        target_files = sorted(glob.glob(self.args.dataset_file_path + self.data_mode + '_target*.dat'))
        extract_id_files = sorted(glob.glob(self.args.dataset_file_path + self.data_mode + '_tag*.dat'))
        assert len(source_files) != 0
        assert len(source_files) == len(target_files)
        assert len(extract_id_files) == len(source_files)

        example_files = list(zip(source_files, target_files, extract_id_files))
        if self.data_mode == 'test':
            example_files = sorted(example_files)
        else:
            random.shuffle(example_files)

        for (source_file, target_file, extract_id_file) in example_files:
            if self.args.original_result:
                # Debug: always read from a fixed pair of files.
                source_file = './tmp/cnn_dm_processed/dataset/train_source_108.dat'
                target_file = './tmp/cnn_dm_processed/dataset/train_target_108.dat'

            print(self.data_mode + 'ing:', source_file)
            print(self.data_mode + 'ing:', target_file)
            print(self.data_mode + 'ing:', extract_id_file)

            with open(source_file) as f:
                train_X_list = [sentence.strip() for sentence in f]
            with open(target_file) as f:
                train_y_list = [sentence.strip() for sentence in f]
            with open(extract_id_file) as f:
                train_extract_list = [sentence.strip() for sentence in f]

            if not self.args.original_result:
                train_unity = sorted(zip(train_X_list, train_y_list, train_extract_list),
                                     key=lambda x: len(x[0].split()), reverse=True)
                train_X_list, train_y_list, train_extract_list = zip(*train_unity)  # tuples

            for (article, abstract, extract_id_str) in zip(train_X_list, train_y_list, train_extract_list):
                if self.args.target_split:
                    # Use the <s> and </s> tags in the abstract to get a list of sentences.
                    abstract_sentences = [sent.strip() for sent in self.vocab.abstract2sents(abstract)]
                    abstract = ' '.join(abstract_sentences)  # abstract_sentences is used in beam search
                example = Example(self.args, article, abstract, extract_id_str, self.vocab)  # Process into an Example.
                # Originally the filter was on source length (min/max_source_length);
                # here we filter by the number of extracted sentences instead,
                # since the sentence selector handles source length.
                if len(example.extract_ids) < 4 or len(example.extract_ids) > self.args.pos_num:
                    continue
                yield example

    @property
    def batches(self):
        example_generator = self.examples
        while True:
            try:
                if self.data_mode != 'train' and self.single_pass:
                    # Beam-search decode mode: a single example repeated across the batch.
                    ex = next(example_generator)
                    batch = [ex for _ in range(self.args.batch_size)]
                    yield Batch(self.args, batch, self.vocab)
                else:
                    inputs = [next(example_generator)
                              for _ in range(self.args.batch_size *
                                             (self.args.bucket_cache_size if self.data_mode == 'train' else 1))]
                    # Sort by length of the encoder sequence.
                    inputs = sorted(inputs, key=lambda inp: inp.X_len, reverse=True)
                    batches = [inputs[i:i + self.args.batch_size]
                               for i in range(0, len(inputs), self.args.batch_size)]
                    shuffle(batches)
                    for batch in batches:  # each batch is a list of Example objects
                        batch = Batch(self.args, batch, self.vocab)
                        # The two counts may differ; only yield consistent batches.
                        if sum(batch.X_doc_lens) == sum(len(ex_extract_ids) for ex_extract_ids in batch.batch_extract_ids):
                            yield batch
                        else:
                            print("my warning: number of batch_extract_ids doesn't match X_doc_lens")
            except StopIteration:
                print('StopIteration, examples of this epoch are done in dataset batches, data mode', self.data_mode)
                break