def __init__(self):
    self._qa = du.json_load(_mp.qa_file)
    self._split = du.json_load(_mp.splits_file)
    self.video_data = du.json_load(_mp.video_data_file)
    self._inc = {
        'split': set(self._split.keys()),
        'imdb_key': set(k for v in self._split.values() for k in v),
        'video_clips': {False},
    }
def main():
    index = du.json_load(_mp.sample_index_file)
    subtitle = Subtitle().include(imdb_key=['tt0086190']).get()
    sample = du.json_load(_mp.sample_frame_file)
    qa = QA().include(imdb_key=['tt0086190']).get()
    # for ins in qa:
    #     if ins['video_clips']:
    #         print(ins['qid'])
    #         print(ins['question'])
    #         print(ins['answers'])
    #         print(ins['answers'][ins['correct_index']])
    ins = qa[0]
    spec = np.load(os.path.join(_mp.encode_dir, ins['qid'] + '_spec.npy'))
    iid = [idx for i, idx in enumerate(index[ins['imdb_key']]) if spec[i] == 1]
    sentences = [subtitle[ins['imdb_key']]['lines'][idx] for idx in iid]
    imgs = []
    for v in sorted(fu.basename_wo_ext(n) for n in ins['video_clips']):
        imgs.extend([
            os.path.join(_mp.image_dir, v, '%s_%05d.jpg' % (v, i + 1))
            for i in sample[ins['imdb_key']][v]
        ])
    print(len(imgs))
    for idx, img in enumerate(imgs):
        copy(img,
             os.path.join(_mp.benchmark_dir, 'pickup',
                          '%d_%s.jpg' % (idx, sentences[idx])))
    # ins['lines'] = sentences
    du.json_dump(ins, os.path.join(_mp.benchmark_dir, 'pickup.json'))
def create_tfrecord(encode_qa, encode_subt, split, mode, num_per_shards):
    split_qa = [qa for qa in encode_qa if split in qa['qid']]
    video_data = du.json_load(_mp.video_data_file)
    imdb_split = du.json_load(_mp.splits_file)
    video_split = [k for k in video_data if k in imdb_split[split]]
    # Group the split's QAs by movie so each shard holds one movie.
    qa = {k: [q for q in split_qa if q['imdb_key'] == k] for k in video_split}
    fu.make_dirs(dataset_dir)
    args = Args()
    args.subt_max, args.q_max, args.a_max = find_max_length(
        encode_qa, encode_subt)
    manager = Manager()
    split_qa = manager.dict(qa)
    encode_subt = manager.dict(encode_subt)
    video_data = manager.dict(video_data)
    args.split = split
    args.mode = mode
    args.num_per_shards = num_per_shards
    func = partial(create_one_tfrecord, split_qa, encode_subt, args, video_data)
    # num_shards = int(math.ceil(len(split_qa) / float(num_per_shards)))
    keys = list(split_qa.keys())
    num_shards = len(keys)
    with Pool(8) as pool, \
            tqdm(total=num_shards, desc='Create %s Tfrecord' % split) as pbar:
        for _ in pool.imap_unordered(func, keys):
            pbar.update()
def create_vocab(qa, subtitle, video_data, gram_vocab, gram_embed):
    if not os.path.exists(_mp.vocab_file):
        vocab = Counter()
        # Tokenize subtitle lines in place and count word occurrences.
        for key in tqdm(video_data, desc='Tokenize Subtitle'):
            subt = subtitle[key]
            for idx, line in enumerate(subt['lines']):
                line = wordpunct_tokenize(line.strip().lower())
                vocab.update(line)
                subt['lines'][idx] = line
        for ins in tqdm(qa, desc='Tokenize QA'):
            ins['question'] = wordpunct_tokenize(ins['question'].strip().lower())
            vocab.update(ins['question'])
            # Empty answers fall back to a single period token.
            ins['answers'] = [
                wordpunct_tokenize(sent.strip().lower()) if sent else ['.']
                for sent in ins['answers']
            ]
            for sent in ins['answers']:
                vocab.update(sent)
        # Index 0 is reserved for padding; valid words start at 1.
        filter_vocab, idx_vocab = {}, 1
        frequency = {}
        vocab_embed = np.zeros((len(vocab) + 1, gram_embed.shape[1]),
                               dtype=np.float32)
        for v in tqdm(vocab, desc='Create Embedding'):
            # Sum the embeddings of character n-grams (fastText-style),
            # with '<' and '>' as word-boundary markers.
            v_ = '<' + v + '>'
            v_gram = [c for c in v] + \
                     [v_[i:i + 3] for i in range(len(v_) - 2)] + \
                     [v_[i:i + 6] for i in range(len(v_) - 5)]
            v_gram_code = [
                gram_vocab[gram] for gram in v_gram if gram in gram_vocab
            ]
            if v_gram_code:
                frequency[v] = vocab[v]
                filter_vocab[v] = idx_vocab
                vocab_embed[idx_vocab] = np.sum(gram_embed[v_gram_code], axis=0)
                idx_vocab += 1
        # Trim rows of words that matched no known n-gram.
        vocab_embed = vocab_embed[:idx_vocab]
        du.json_dump(frequency, _mp.freq_file)
        du.json_dump(filter_vocab, _mp.vocab_file)
        np.save(_mp.embedding_file, vocab_embed)
        print(len(vocab_embed))
        du.json_dump(subtitle, _mp.temp_subtitle_file)
        du.json_dump(qa, _mp.tokenize_qa)
    else:
        filter_vocab = du.json_load(_mp.vocab_file)
        subtitle = du.json_load(_mp.temp_subtitle_file)
        vocab_embed = np.load(_mp.embedding_file)
        frequency = du.json_load(_mp.freq_file)
        qa = du.json_load(_mp.tokenize_qa)
    return filter_vocab, subtitle, vocab_embed, frequency, qa
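# A minimal, self-contained sketch of the n-gram decomposition used in
# create_vocab above (the function name here is illustrative). It mirrors
# the code as written: single characters come from the bare word, while
# 3- and 6-grams come from the '<'/'>'-wrapped form.
def char_ngrams(word):
    wrapped = '<' + word + '>'
    return ([c for c in word]
            + [wrapped[i:i + 3] for i in range(len(wrapped) - 2)]
            + [wrapped[i:i + 6] for i in range(len(wrapped) - 5)])

print(char_ngrams('cat'))  # ['c', 'a', 't', '<ca', 'cat', 'at>']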
def main(): """ Main function. :return: None """ print('Start loading data...') qa = QA().include(video_clips=True).get() video_data = du.json_load(_mp.video_data_file) frame_time = FrameTime().get() subtitle = Subtitle().get() if args.mode == 0: gram_vocab = { k: i for i, k in enumerate(du.json_load(_ep.gram_vocab_file)) } gram_embed = np.load(_ep.gram_embedding_vec_file) filter_vocab, subtitle, vocab_embed, frequency, qa = \ create_vocab(qa, subtitle, video_data, gram_vocab, gram_embed) collect_embedding(qa, subtitle, video_data, filter_vocab, vocab_embed, frequency) elif args.mode == 1: gen_embedding(qa, subtitle, video_data) elif args.mode == 2: glove_vocab = { k: i for i, k in enumerate(du.json_load(_ep.glove_embedding_key_file)) } glove_embed = np.load(_ep.glove_embedding_vec_file) filter_vocab, subtitle, vocab_embed, frequency, qa = \ create_vocab(qa, subtitle, video_data, glove_vocab, glove_embed) collect_embedding(qa, subtitle, video_data, filter_vocab, vocab_embed, frequency) print('Process done!') sample = subtitle_process(video_data, frame_time, subtitle) for ins in tqdm(qa, desc='Create Spectrum'): video_list = sorted(list(sample[ins['imdb_key']].keys())) num_frame = sum([len(sample[ins['imdb_key']][v]) for v in video_list]) spectrum = np.zeros(num_frame, dtype=np.int32) index = 0 for v in video_list: num = len(sample[ins['imdb_key']][v]) if v + '.mp4' in ins['video_clips']: spectrum[index:index + num] = 1 index += num assert np.sum(spectrum) > 0, '%s no content needed.' % ins['qid'] np.save(os.path.join(_mp.encode_dir, ins['qid'] + '_spec' + '.npy'), spectrum)
def main(_):
    if FLAGS.test:
        print('Test tfrecords.')
        test()
    else:
        encode_qa = du.json_load(config.avail_encode_qa_file)
        encode_subtitle = du.json_load(config.encode_subtitle_file)
        print('Json file loading done !!')
        fu.make_dirs(config.dataset_dir)
        create_tfrecord(encode_qa['encode_qa_%s' % FLAGS.split],
                        encode_subtitle,
                        split=FLAGS.split,
                        modality=FLAGS.modality,
                        is_training=FLAGS.is_training)
def main():
    args = parse_args()
    split = args.split
    encode_qa = du.json_load(_mp.encode_qa_file)
    encode_subt = du.json_load(_mp.encode_subtitle_file)
    if args.count:
        count(encode_qa)
    else:
        if 'train' in split:
            create_tfrecord(encode_qa, encode_subt, 'train', args.mode,
                            args.num_per_shards)
        if 'val' in split:
            create_tfrecord(encode_qa, encode_subt, 'val', args.mode,
                            args.num_per_shards)
        if 'tests' in split:
            create_tfrecord(encode_qa, encode_subt, 'tests', args.mode,
                            args.num_per_shards)
def __init__(self):
    if exists(_mp.shot_boundary_file):
        self._sb = du.json_load(_mp.shot_boundary_file)
    else:
        self._sb = self.process()
    self._inc = {
        'imdb_key': set(k.split('.')[0] for k in self._sb),
        'videos': set(self._sb),
    }
def main():
    with Manager() as manager, Pool(4) as pool:
        print('Loading data...')
        index = manager.dict(du.json_load(_mp.sample_index_file))
        subtitle = manager.dict(Subtitle().get())
        train_attn = manager.dict(
            dict(np.load(os.path.join(_mp.attn_dir, args.mod,
                                      'train_attn.npz'))))
        val_attn = manager.dict(
            dict(np.load(os.path.join(_mp.attn_dir, args.mod,
                                      'val_attn.npz'))))
        train_pair = manager.dict(
            dict(np.load(os.path.join(_mp.attn_dir, args.mod,
                                      'train_pair.npz'))))
        val_pair = manager.dict(
            dict(np.load(os.path.join(_mp.attn_dir, args.mod,
                                      'val_pair.npz'))))
        train_qa = QA().include(video_clips=True, split={'train'}).get()
        val_qa = QA().include(video_clips=True, split={'val'}).get()
        print('Loading done!')
        func = partial(gen_attn, train_attn, train_pair, index, subtitle)
        for _ in pool.imap_unordered(func, tqdm(train_qa)):
            pass
        func = partial(gen_attn, val_attn, val_pair, index, subtitle)
        for _ in pool.imap_unordered(func, tqdm(val_qa)):
            pass
def subtitle_process(video_data, frame_time, subtitle):
    if not exists(_mp.tokenize_subt):
        manager = Manager()
        tokenize_subt = manager.dict()
        video_data = manager.dict(video_data)
        frame_time = manager.dict(frame_time)
        subtitle = manager.dict(subtitle)
        keys = list(video_data.keys())
        # Align subtitle lines to frame times in parallel, one movie per task.
        align_func = partial(align_subtitle, video_data, frame_time, subtitle,
                             tokenize_subt)
        with Pool(4) as p, tqdm(total=len(keys), desc="Align subtitle") as pbar:
            for _ in p.imap_unordered(align_func, keys):
                pbar.update()
        res = tokenize_subt.copy()
        du.json_dump(res, _mp.tokenize_subt)
    else:
        res = du.json_load(_mp.tokenize_subt)
    return res
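# The Manager/Pool idiom used above, in miniature (the worker name is
# made up): a proxy dict is bound into the worker via functools.partial,
# each task writes its result under its own key, and the proxy is copied
# back to a plain dict once the pool finishes.
from functools import partial
from multiprocessing import Manager, Pool

def work(shared, key):
    shared[key] = key * 2  # stand-in for align_subtitle

if __name__ == '__main__':
    with Manager() as manager, Pool(4) as pool:
        shared = manager.dict()
        for _ in pool.imap_unordered(partial(work, shared), range(8)):
            pass
        print(shared.copy())  # {0: 0, 1: 2, 2: 4, ...}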
def test_npy():
    video_data = du.json_load(config.video_data_file)
    for key in tqdm(video_data.keys(), desc="Check sanity of features"):
        if video_data[key]['avail']:
            feat = np.load(du.get_npy_name(config.feature_dir, key))
            assert len(feat) == video_data[key]['info']['num_frames'], \
                "Previous feature - %s is not aligned." % key
def main():
    sample = du.json_load(_mp.sample_frame_file)
    for k in sample:
        # Rough per-movie feature footprint. The 4 * 4 * 1536 constants come
        # from the original; with 4-byte float32 values they would correspond
        # to a (4, 1536) feature block per sampled frame.
        print(k,
              '%.3f' % (sum(len(sample[k][v]) for v in sample[k]) *
                        4 * 4 * 1536 / 1024 / 1024 / 1024),
              'GiB')
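# A quick worked instance of the formula above, assuming 10,000 sampled
# frames for a movie (a made-up number):
frames = 10000
print('%.3f GiB' % (frames * 4 * 4 * 1536 / 1024 / 1024 / 1024))  # 0.229 GiB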
def check():
    sample = du.json_load(_mp.sample_frame_file)
    for imdb_key in tqdm(sample, desc='Check..'):
        capacity = 0
        videos = list(sample[imdb_key].keys())
        videos.sort()
        for v in tqdm(videos):
            capacity += len(sample[imdb_key][v])
        # The stacked feature file must hold exactly one row per sampled frame.
        if load_shape(join(_mp.object_feature_dir,
                           imdb_key + '.npy'))[0] != capacity:
            raise ValueError(
                'Feature file %s is not aligned with the sampled frames.' %
                join(_mp.object_feature_dir, imdb_key + '.npy'))
def main():
    video_qa = [qa for qa in du.json_load(_mp.qa_file) if qa['video_clips']]
    train = [qa for qa in video_qa if 'train' in qa['qid']]
    val = [qa for qa in video_qa if 'val' in qa['qid']]
    test = [qa for qa in video_qa if 'tests' in qa['qid']]
    with Pool(4) as pool, Manager() as manager:
        index = manager.dict(du.json_load(_mp.sample_index_file))
        subtitle = manager.dict(Subtitle().get())
        # Shared counter of correct predictions, incremented inside do_attn.
        counter = manager.Value(int, 0)
        func = partial(do_attn, index, subtitle, counter)
        for _ in pool.imap_unordered(func, tqdm(train)):
            pass
        print('train acc: %.4f' % (counter.value / len(train)))
        counter = manager.Value(int, 0)
        func = partial(do_attn, index, subtitle, counter)
        for _ in pool.imap_unordered(func, tqdm(val)):
            pass
        print('val acc: %.4f' % (counter.value / len(val)))
def _select(self):
    if self.rest['auto'] or self.rest['continue']:
        exp_paths = glob(join(cp.log_dir, '**', '*.json'), recursive=True)
        experiments = [du.json_load(p) for p in exp_paths]
        if self.rest['auto']:
            self._auto(experiments)
        elif self.rest['continue']:
            self._continue(experiments)
    self.setup()
def __init__(self):
    self.test_pattern = join(_mp.dataset_dir, 'tests*.tfrecord')
    self.test_files = glob(self.test_pattern)
    self.test_files.sort()
    self._length = len([
        0 for qa in du.json_load(_mp.qa_file)
        if 'tests' in qa['qid'] and qa['video_clips']
    ])
    dataset = tf.data.Dataset.from_tensor_slices(self.test_files)
    dataset = dataset.interleave(tf.data.TFRecordDataset,
                                 cycle_length=4,
                                 block_length=1).prefetch(16)
    dataset = dataset.map(test_parser, num_parallel_calls=4).prefetch(16)
    self.iterator = dataset.make_initializable_iterator()
    self.next_elements = self.iterator.get_next()
    self.initializer = self.iterator.initializer
    self.ques, self.ans, self.subt, self.feat, self.ql, self.al, self.sl = \
        self.next_elements
def __init__(self, split='train', mode='feat+subt', shuffle=True):
    self.shuffle = shuffle
    vsqa = [qa for qa in du.json_load(_mp.qa_file) if qa['video_clips']]
    self.qa = [qa for qa in vsqa if split in qa['qid']]
    self.index = list(range(len(self)))
    # Placeholder -> file-list pairs; insertion order matters, since the
    # placeholders are addressed by position below.
    self._feed_dict = {
        tf.placeholder(dtype=tf.string, shape=[None]):
            [join(_mp.encode_dir, qa['qid'] + '.npy') for qa in self.qa],
        tf.placeholder(dtype=tf.string, shape=[None]):
            [join(_mp.encode_dir, qa['imdb_key'] + '.npy') for qa in self.qa],
        tf.placeholder(dtype=tf.string, shape=[None]): [
            join(_mp.object_feature_dir, qa['imdb_key'] + '.npy')
            for qa in self.qa
        ],
        tf.placeholder(dtype=tf.int64, shape=[None]):
            [qa['correct_index'] for qa in self.qa],
        tf.placeholder(dtype=tf.string, shape=[None]): [
            join(_mp.encode_dir, qa['qid'] + '_spec.npy') for qa in self.qa
        ],
    }
    self.placeholders = list(self._feed_dict.keys())
    dataset = tf.data.Dataset.from_tensor_slices(self.placeholders[0]).repeat(1)
    func = partial(load, comp='qa', mode=mode)
    qa_dataset = dataset.map(func, num_parallel_calls=1).prefetch(1)
    dataset = tf.data.Dataset.from_tensor_slices(self.placeholders[1]).repeat(1)
    func = partial(load, comp='subt', mode=mode)
    subt_dataset = dataset.map(func, num_parallel_calls=2).prefetch(2)
    dataset = tf.data.Dataset.from_tensor_slices(self.placeholders[2]).repeat(1)
    func = partial(load, comp='feat', mode=mode)
    feat_dataset = dataset.map(func, num_parallel_calls=4).prefetch(4)
    gt_dataset = tf.data.Dataset.from_tensor_slices(
        self.placeholders[3]).repeat(1)
    dataset = tf.data.Dataset.from_tensor_slices(self.placeholders[4]).repeat(1)
    func = partial(load, comp='spec', mode=mode)
    spec_dataset = dataset.map(func, num_parallel_calls=1).prefetch(1)
    dataset = tf.data.Dataset.zip(
        (qa_dataset, subt_dataset, feat_dataset, gt_dataset, spec_dataset))
    dataset = dataset.prefetch(128)
    iterator = dataset.make_initializable_iterator()
    next_element = iterator.get_next()
    (self.ques, self.ans), self.subt, self.feat, self.gt, self.spec = \
        next_element
    self.gt = tf.expand_dims(self.gt, axis=0)
    self.next_element = (self.ques, self.ans, self.subt, self.feat, self.gt,
                         self.spec)
    self.initializer = iterator.initializer
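# A minimal sketch of the placeholder-fed initializable-iterator pattern
# used above (TensorFlow 1.x API, matching the original code; file names
# here are made up):
import tensorflow as tf

ph = tf.placeholder(tf.string, [None])
ds = tf.data.Dataset.from_tensor_slices(ph).repeat(1)
it = ds.make_initializable_iterator()
nxt = it.get_next()

with tf.Session() as sess:
    sess.run(it.initializer, feed_dict={ph: ['a.npy', 'b.npy']})
    print(sess.run(nxt))  # b'a.npy'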
def main():
    args = arg_parse()
    if args.rm:
        remove_all()
    qa = QA().include(video_clips=True).get()
    video_data = du.json_load(_mp.video_data_file)
    frame_time = FrameTime().get()
    subtitle = Subtitle().get()
    gram_vocab = {
        k: i for i, k in enumerate(du.json_load(_ep.gram_vocab_file))
    }
    gram_embed = np.load(_ep.gram_embedding_vec_file)
    filter_vocab, subtitle, vocab_embed, frequency, qa = \
        create_vocab(qa, subtitle, video_data, gram_vocab, gram_embed)
    collect_embedding(qa, subtitle, video_data, filter_vocab, vocab_embed,
                      frequency)
    subtitle_process(video_data, frame_time, subtitle)
def main():
    manager = Manager()
    encode_dict = manager.dict()
    # objfeat_dict = manager.dict()
    # subtfeat_dict = manager.dict()
    objfeat_dict = {}
    subtfeat_dict = {}
    func = partial(load_ques, encode_dict)
    with Pool(4) as pool, \
            tqdm([qa for qa in du.json_load(_mp.qa_file)
                  if qa['video_clips']]) as pbar:
        for _ in pool.imap_unordered(func, pbar):
            pass
    np.savez(_mp.qa_feature, **encode_dict.copy())
def get_images_path():
    file_names, capacity, npy_names = [], [], []
    # Flatten {imdb_key: {video: meta}} into a single {video: meta} mapping.
    video_data = dict(item for v in du.json_load(_mp.video_data_file).values()
                      for item in v.items())
    func = partial(collect, video_data)
    with Pool(16) as pool, \
            tqdm(total=len(video_data), desc='Collect images') as pbar:
        for ins in pool.imap_unordered(func, list(video_data.keys())):
            if ins:
                npy_names.append(ins[0])
                capacity.append(ins[1])
                file_names.extend(ins[2])
            pbar.update()
    return file_names, capacity, npy_names
def create_vocab_embedding(tokenize_subt, tokenize_qa):
    if not exists(_mp.embedding_file) or not exists(_mp.vocab_file):
        vocab = create_vocab(tokenize_subt, tokenize_qa)
        filter_vocab, idx_vocab = {}, 1
        gram_vocab = {
            k: i for i, k in enumerate(du.json_load(_ep.gram_vocab_file))
        }
        gram_embed = np.load(_ep.gram_embedding_vec_file)
        vocab_embed = np.zeros((len(vocab) + 1, gram_embed.shape[1]),
                               dtype=np.float32)
        for v in tqdm(vocab, desc='Create embedding of vocabulary'):
            v_ = '<' + v + '>'
            v_gram = [c for c in v_] + \
                     [v_[i:i + 3] for i in range(len(v_) - 2)] + \
                     [v_[i:i + 6] for i in range(len(v_) - 5)]
            v_gram_code = [
                gram_vocab[gram] for gram in v_gram if gram in gram_vocab
            ]
            if v_gram_code:
                filter_vocab[v] = idx_vocab
                # Write the row at idx_vocab (not the enumerate index) so the
                # embedding rows stay aligned with the filter_vocab indices.
                vocab_embed[idx_vocab] = np.sum(gram_embed[v_gram_code], axis=0)
                idx_vocab += 1
        # Zero-safe L2 normalization: rows with zero norm are divided by 1.
        norm = np.linalg.norm(vocab_embed, axis=1, keepdims=True)
        norm = np.select([norm > 0], [norm], default=1.)
        print(norm.shape)
        norm_vocab_embed = vocab_embed / norm
        print(norm_vocab_embed.shape)
        du.json_dump(filter_vocab, _mp.vocab_file)
        np.save(_mp.embedding_file, norm_vocab_embed)
    else:
        filter_vocab = du.json_load(_mp.vocab_file)
    return filter_vocab
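# The zero-safe normalization above, in miniature: the all-zero padding
# row keeps a norm of 0 after dividing by the substituted 1.
import numpy as np

emb = np.array([[0., 0.], [3., 4.]], dtype=np.float32)
norm = np.linalg.norm(emb, axis=1, keepdims=True)  # [[0.], [5.]]
norm = np.select([norm > 0], [norm], default=1.)   # [[1.], [5.]]
print(emb / norm)  # [[0.  0. ], [0.6 0.8]]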
def encode_sentence(tokenize_subt, tokenize_qa, vocab):
    if not exists(_mp.encode_subtitle_file) or not exists(_mp.encode_qa_file):
        encode_subt, encode_qa = {}, tokenize_qa
        for imdb in tqdm(tokenize_subt, desc='Encode subtitle'):
            encode_subt[imdb] = {}
            for v in tokenize_subt[imdb]:
                encode_subt[imdb][v] = []
                for sent in tokenize_subt[imdb][v]:
                    # Sentences with no in-vocabulary word fall back to the
                    # period token.
                    temp = [vocab[w] for w in sent if w in vocab]
                    if temp:
                        encode_subt[imdb][v].append(temp)
                    else:
                        encode_subt[imdb][v].append([vocab['.']])
        for ins in tqdm(encode_qa, desc='Encode question answer'):
            temp = [vocab[w] for w in ins['question'] if w in vocab]
            ins['question'] = temp if temp else [vocab['.']]
            for idx, a in enumerate(ins['answers']):
                temp = [vocab[w] for w in a if w in vocab]
                ins['answers'][idx] = temp if temp else [vocab['.']]
        du.json_dump(encode_subt, _mp.encode_subtitle_file)
        du.json_dump(encode_qa, _mp.encode_qa_file)
    else:
        encode_subt = du.json_load(_mp.encode_subtitle_file)
        encode_qa = du.json_load(_mp.encode_qa_file)
    return encode_subt, encode_qa
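# Illustration of the fallback rule above with a made-up vocabulary:
# a sentence whose words are all out of vocabulary encodes to [vocab['.']].
vocab = {'.': 1, 'who': 2, 'is': 3}
print([vocab[w] for w in ['who', 'is', 'korben'] if w in vocab] or [vocab['.']])
# [2, 3]
print([vocab[w] for w in ['zorg'] if w in vocab] or [vocab['.']])
# [1]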
def main():
    args = parse_args()
    split = args.split
    encode_qa = du.json_load(_mp.tokenize_qa)
    if args.count:
        count(encode_qa)
    else:
        if 'train' in split:
            create_tfrecord(encode_qa, 'train', args.mode, args.num_per_shards)
        if 'val' in split:
            create_tfrecord(encode_qa, 'val', args.mode, args.num_per_shards)
        if 'tests' in split:
            create_tfrecord(encode_qa, 'tests', args.mode, args.num_per_shards)
def check(): """ Check the numbers of image files in directories are same as ones in meta data. :return: None """ # video meta data video_data = du.json_load(_mp.video_data_file) for volume in video_data.values(): # each video for v in volume: img_dir = join(_mp.image_dir, v) true_length = len(glob(join(img_dir, '*.jpg'))) if volume[v]['real_frames'] != len(glob(join(img_dir, '*.jpg'))): # check the number print(v, true_length, volume[v]['real_frames'])
def get_images_path_v3():
    file_names, capacity, npy_names = [], [], []
    sample = du.json_load(_mp.sample_frame_file)
    for imdb_key in tqdm(sample, desc='Collect Images'):
        # if not exists(join(_mp.feature_dir, imdb_key + '.npy')):
        npy_names.append(join(_mp.feature_dir, imdb_key + '.npy'))
        videos = list(sample[imdb_key].keys())
        videos.sort()
        num = 0
        for v in tqdm(videos):
            images = [
                join(_mp.image_dir, v, '%s_%05d.jpg' % (v, i + 1))
                for i in sample[imdb_key][v]
            ]
            file_names.extend(images)
            num += len(images)
        capacity.append(num)
    # print(capacity, npy_names)
    return file_names, capacity, npy_names
def main():
    args = arg_parse()
    if args.rm:
        remove_all()
    if args.max:
        print_max()
    else:
        qa = QA().include(video_clips=True).get()
        video_data = du.json_load(_mp.video_data_file)
        frame_time = FrameTime().get()
        subtitle = Subtitle().get()
        tokenize_subt = subtitle_process(video_data, frame_time, subtitle)
        tokenize_qa = tokenize_question_answer(qa)
        vocab = create_vocab_embedding(tokenize_subt, tokenize_qa)
        encode_subt, encode_qa = encode_sentence(tokenize_subt, tokenize_qa,
                                                 vocab)
        save_encode(encode_subt, encode_qa)
def get_images_path_v2():
    file_names, capacity, npy_names = [], [], []
    video_data = du.json_load(_mp.video_data_file)
    for imdb_key in tqdm(video_data, desc='Collect Images'):
        npy_names.append(join(_mp.object_feature_dir, imdb_key + '.npy'))
        videos = list(video_data[imdb_key].keys())
        videos.sort()
        num = 0
        for v in tqdm(videos):
            # Sample every 15th frame of each video clip.
            images = [
                join(_mp.image_dir, v, '%s_%05d.jpg' % (v, i + 1))
                for i in range(0, video_data[imdb_key][v]['real_frames'], 15)
            ]
            file_names.extend(images)
            num += len(images)
        capacity.append(num)
    # print(capacity, npy_names)
    return file_names, capacity, npy_names
def main():
    tester = TestManager()
    try:
        # Run the tester with extra tags '11'..'20', then majority-vote the
        # per-question answers across all dumped result files.
        for i in range(11, 21):
            args.extra = '%02d' % i
            tester.test()
        ans_file_list = glob.glob(os.path.join(_mp.test_dir, '*.json'))
        ans_dict = defaultdict(Counter)
        for f in ans_file_list:
            ans = du.json_load(f)
            for qid, a in ans.items():
                ans_dict[qid].update([a])
        with open('movie_results.txt', 'w') as f:
            for qid in sorted(ans_dict.keys(),
                              key=lambda x: int(re.search(r'(\d+)\b', x)[0])):
                ta = ans_dict[qid].most_common(1)[0][0]
                f.write('%s %s\n' % (qid, ta))
    except KeyboardInterrupt:
        print()
def __init__(self, shuffle=True, split='train', mode='feat+subt'):
    self.shuffle = shuffle
    # if 'subt' not in mode:
    #     self.pattern = join(_mp.dataset_dir,
    #                         '-'.join(['feat', split, '*.tfrecord']))
    # elif 'feat' not in mode:
    #     self.pattern = join(_mp.dataset_dir,
    #                         '-'.join(['subt', split, '*.tfrecord']))
    # else:
    self.pattern = join(_mp.dataset_dir, split + '*.tfrecord')
    self._files = glob(self.pattern)
    self._files.sort()
    self._length = len([
        0 for qa in du.json_load(_mp.qa_file)
        if split in qa['qid'] and qa['video_clips']
    ])
    parser = partial(dual_parser, mode=mode)
    self.placeholder = tf.placeholder(tf.string, [None])
    dataset = tf.data.Dataset.from_tensor_slices(self.placeholder)
    dataset = dataset.interleave(tf.data.TFRecordDataset,
                                 cycle_length=4,
                                 block_length=1).prefetch(8)
    dataset = dataset.map(parser, num_parallel_calls=4).prefetch(8)
    if shuffle:
        dataset = dataset.shuffle(32)
    self.iterator = dataset.make_initializable_iterator()
    self.next_elements = self.iterator.get_next()
    self.initializer = self.iterator.initializer
    print(*[i.get_shape() for i in self.next_elements])
    # The parser's output arity depends on the mode.
    if 'subt' not in mode:
        self.ques, self.ans, self.gt, self.feat, self.spec = self.next_elements
    elif 'feat' not in mode:
        self.ques, self.ans, self.gt, self.subt, self.spec = self.next_elements
    else:
        self.ques, self.ans, self.gt, self.subt, self.feat, self.spec = \
            self.next_elements
def load_embedding_vec(target):
    """
    Load the word embeddings of the given method. See EmbeddingPath for how
    to prepare the data.
    :param target: string, target word embedding ('glove', 'w2v' or 'fasttext')
    :return embedding_keys, embedding_vecs: list of words, and numpy array of
        embedding vectors
    """
    start_time = time.time()
    # File and loader setup for each embedding.
    if target == 'glove':
        key_file = cp.glove_embedding_key_file
        vec_file = cp.glove_embedding_vec_file
        raw_file = cp.glove_file
        load_fn = load_glove
    elif target == 'w2v':
        key_file = cp.w2v_embedding_key_file
        vec_file = cp.w2v_embedding_vec_file
        raw_file = cp.word2vec_file
        load_fn = load_w2v
    elif target == 'fasttext':
        key_file = cp.ft_embedding_key_file
        vec_file = cp.ft_embedding_vec_file
        raw_file = cp.fasttext_file
        load_fn = load_glove
    else:
        key_file = None
        vec_file = None
        raw_file = None
        load_fn = None
    # Use the cached key/vector files if they already exist.
    if exists(key_file) and exists(vec_file):
        embedding_keys = du.json_load(key_file)
        embedding_vecs = np.load(vec_file)
    else:
        # Otherwise, parse the raw text file and cache the result.
        embedding_keys, embedding_vecs = load_fn(raw_file)
        du.json_dump(embedding_keys, key_file)
        np.save(vec_file, embedding_vecs)
    print('Loading embedding done. %.3f s' % (time.time() - start_time))
    return embedding_keys, embedding_vecs
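# Hypothetical usage, assuming the cp.* paths above are configured:
keys, vecs = load_embedding_vec('glove')
word2idx = {w: i for i, w in enumerate(keys)}
print(len(keys), vecs.shape)  # vocabulary size, (vocab_size, embed_dim)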