def _init_token_data(self):
    to_tokenize = {}
    for mode in ["train", "test"]:
        token_data_attr = "_{mode}_tokens".format(mode=mode)
        token_data_file = "_{mode}_tokens.pkl".format(mode=mode)
        token_data_path = join(self._gen_dir, token_data_file)
        if exists(token_data_path):
            token_data = unpickle_file(token_data_path)
            setattr(self, token_data_attr, token_data)
        else:
            data_dict_attr = "_{mode}_dict".format(mode=mode)
            data_dict = getattr(self, data_dict_attr)
            to_tokenize[mode] = data_dict
    if to_tokenize:
        #! Regardless of buckets, all vocab must be tokenized,
        #! otherwise risk experiment failing with empty target
        include = set(self._vocab) | set(
            corpora_vocab(self._train_corpus, self._test_corpus))
        include_tokens_path = join(self._gen_dir, "_incl_tokens.pkl")
        pickle_file(path=include_tokens_path, data=include)
        tokens_dict = tokenize_data(
            include=include,
            case_insensitive=self._embedding.case_insensitive,
            **to_tokenize,
        )
        for mode, token_data in tokens_dict.items():
            token_data_attr = "_{mode}_tokens".format(mode=mode)
            token_data_file = "_{mode}_tokens.pkl".format(mode=mode)
            token_data_path = join(self._gen_dir, token_data_file)
            pickle_file(path=token_data_path, data=token_data)
            setattr(self, token_data_attr, token_data)
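# The pickle helpers used throughout these functions are project-internal and
# not shown here; a minimal sketch of what they are assumed to look like
# (thin wrappers around the standard pickle module that create parent
# directories on write). The names and signatures mirror the call sites above
# but are assumptions, not the project's actual implementation.
import pickle
from os import makedirs
from os.path import dirname, exists


def pickle_file(path, data):
    # assumed helper: serialize `data` to `path`, creating parent dirs
    if dirname(path) and not exists(dirname(path)):
        makedirs(dirname(path))
    with open(path, "wb") as pkl:
        pickle.dump(data, pkl)


def unpickle_file(path):
    # assumed helper: load a previously pickled object from `path`
    with open(path, "rb") as pkl:
        return pickle.load(pkl)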
def _init_corpus(self, mode):
    corpus_attr = "_{mode}_corpus".format(mode=mode)
    corpus_file = "_{mode}_corpus.pkl".format(mode=mode)
    corpus_path = join(self._gen_dir, corpus_file)
    if exists(corpus_path):
        corpus = unpickle_file(corpus_path)
    else:
        corpora = (
            lower_corpus(getattr(dataset, corpus_attr))
            if self._embedding.case_insensitive
            else getattr(dataset, corpus_attr)
            for dataset in self.datasets
        )
        corpus = merge_corpora(*corpora)
        pickle_file(path=corpus_path, data=corpus)
    setattr(self, corpus_attr, corpus)
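# lower_corpus, merge_corpora and corpora_vocab are project helpers; the
# sketch below assumes a corpus is a token -> frequency mapping, which is
# consistent with the `self._train_corpus.get(w) >= threshold` check further
# down. All three names and their exact semantics are assumptions.
from collections import Counter


def lower_corpus(corpus):
    # assumed behaviour: fold counts of case variants into lowercase keys
    lowered = Counter()
    for token, count in corpus.items():
        lowered[token.lower()] += count
    return dict(lowered)


def merge_corpora(*corpora):
    # assumed behaviour: sum token frequencies across datasets
    merged = Counter()
    for corpus in corpora:
        merged.update(corpus)
    return dict(merged)


def corpora_vocab(*corpora):
    # assumed behaviour: union of tokens appearing in any corpus
    return {token for corpus in corpora for token in corpus}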
def _init_data_dict(self, mode):
    data_dict_attr = "_{mode}_dict".format(mode=mode)
    data_dict_file = "_{mode}_dict.pkl".format(mode=mode)
    data_dict_path = join(self._gen_dir, data_dict_file)
    if exists(data_dict_path):
        data_dict = unpickle_file(data_dict_path)
    else:
        data_dicts = (getattr(dataset, data_dict_attr)
                      for dataset in self.datasets)
        data_dict = accumulate_dicts(*data_dicts)
        pickle_file(path=data_dict_path, data=data_dict)
    class_labels = self._class_labels or []
    class_labels = set(class_labels + data_dict["labels"])
    self._class_labels = list(class_labels)
    setattr(self, data_dict_attr, data_dict)
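# accumulate_dicts is a project helper; a minimal sketch of the behaviour
# relied on here: key-wise merging of several dicts, combining values with an
# accumulator that defaults to list concatenation. The real helper is also
# called elsewhere with `accum_fn`/`default` keyword arguments; `default` is
# accepted below only for interface parity and is not used in this sketch.
def accumulate_dicts(*dicts, accum_fn=None, default=None, **named_dicts):
    accum_fn = accum_fn or (lambda prev, curr: prev + curr)
    accumulated = {}
    for source in list(dicts) + list(named_dicts.values()):
        for key, value in source.items():
            if key in accumulated:
                accumulated[key] = accum_fn(accumulated[key], value)
            else:
                accumulated[key] = value
    return accumulated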
def generate_dataset_files(args):
    dataset_name = args.dataset_name or basename(normpath(args.path))
    parsing_fn = get_parsing_fn(args.path, args.parser_name)
    ftrain, ftest = get_raw_file_paths(args.path)
    train_dict, test_dict = get_dataset_dicts(ftrain, ftest, parsing_fn)
    target_path = join(DATASET_DATA_PATH, dataset_name)
    if exists(target_path):
        if not args.force:
            cprnt(warn="Dataset '{}' already exists, use -f to overwrite".format(
                dataset_name))
            return
        cprnt(info="Overwriting previous '{}' dataset".format(dataset_name))
        rmtree(target_path)
    makedirs(target_path)
    pickle_file(join(target_path, "_train_dict.pkl"), train_dict)
    pickle_file(join(target_path, "_test_dict.pkl"), test_dict)
    return dataset_name
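# Hypothetical CLI wiring for generate_dataset_files, inferred only from the
# attributes it reads (`path`, `dataset_name`, `parser_name`, `force`); the
# flag names below are assumptions, not the project's actual command line.
import argparse


def _make_arg_parser():
    parser = argparse.ArgumentParser(description="Import a raw dataset")
    parser.add_argument("path", help="directory containing the raw train/test files")
    parser.add_argument("--dataset-name", dest="dataset_name", default=None)
    parser.add_argument("--parser-name", dest="parser_name", default=None)
    parser.add_argument("-f", "--force", action="store_true",
                        help="overwrite an existing dataset of the same name")
    return parser


# Usage sketch:
#   args = _make_arg_parser().parse_args()
#   generate_dataset_files(args)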
def _init_data_dict(self, mode, redist):
    data_dict_file = "_{mode}_dict.pkl".format(mode=mode)
    data_dict_attr = "_{mode}_dict".format(mode=mode)
    redist = redist.get(mode) if isinstance(redist, dict) else redist
    srcdir_attr = "_{mode}_srcdir".format(mode=mode)
    data_dict_dir = getattr(self, srcdir_attr)
    data_dict_path = join(data_dict_dir, data_dict_file)
    # expects either a previously generated dict on disk, or a
    # redistribution spec for this mode to resample the source dict
    if exists(data_dict_path):
        data_dict = unpickle_file(data_dict_path)
    elif redist:
        source_data_dict_path = join(self.gen_dir, data_dict_file)
        source_data_dict = unpickle_file(source_data_dict_path)
        data_dict = resample_data_dict(source_data_dict, redist)
        pickle_file(path=data_dict_path, data=data_dict)
    class_labels = self._class_labels or []
    class_labels = set(class_labels + data_dict["labels"])
    self._class_labels = list(class_labels)
    setattr(self, data_dict_attr, data_dict)
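# resample_data_dict is a project helper; the sketch below assumes `redist`
# maps each class label to a desired proportion and that the data dict holds
# parallel lists ("sentences", "targets", "offsets", "labels"). Both are
# assumptions inferred from how the dict is used elsewhere in this listing.
import random


def resample_data_dict(data_dict, redist, seed=1234):
    labels = data_dict["labels"]
    by_label = {}
    for index, label in enumerate(labels):
        by_label.setdefault(label, []).append(index)
    # size the resampled set off the most constrained class
    total = min(
        int(len(by_label.get(label, [])) / proportion)
        for label, proportion in redist.items()
        if proportion > 0
    )
    rng = random.Random(seed)
    chosen = []
    for label, proportion in redist.items():
        available = by_label.get(label, [])
        count = min(int(total * proportion), len(available))
        chosen += rng.sample(available, count)
    chosen.sort()
    keys = [k for k in data_dict if isinstance(data_dict[k], list)]
    return {key: [data_dict[key][i] for i in chosen] for key in keys}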
def _init_corpus(self, mode):
    corpus_pkl_file = "_{mode}_corpus.pkl".format(mode=mode)
    srcdir_attr = "_{mode}_srcdir".format(mode=mode)
    corpus_pkl_dir = getattr(self, srcdir_attr)
    corpus_pkl_path = join(corpus_pkl_dir, corpus_pkl_file)
    if exists(corpus_pkl_path):
        corpus = unpickle_file(corpus_pkl_path)
    else:
        dict_attr = "_{mode}_dict".format(mode=mode)
        data_dict = getattr(self, dict_attr)
        # re-insert each target term into its sentence at the recorded
        # character offset, normalizing the surrounding whitespace
        docs = [
            " ".join([s[:o].strip(), t, s[(o + len(t)):].strip()])
            for s, t, o in zip(
                data_dict["sentences"],
                data_dict["targets"],
                data_dict["offsets"],
            )
        ]
        corpus = generate_corpus(docs, mode)
        pickle_file(data=corpus, path=corpus_pkl_path)
    corpus_attr = "_{mode}_corpus".format(mode=mode)
    setattr(self, corpus_attr, corpus)
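# Quick illustration of the document reconstruction above: the target term is
# re-inserted into the sentence at its character offset, with the whitespace
# around it normalized (the sentence, target, and offset are made up).
s, t, o = "great food but the service was dreadful!", "service", 19
doc = " ".join([s[:o].strip(), t, s[(o + len(t)):].strip()])
# doc == "great food but the service was dreadful!"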
def _vocab_coverage(self):
    _ci = self._embedding.case_insensitive
    v_orig = set(self._embedding.vocab)
    v_extd = set(self._vocab)
    v_train = set(
        sum(
            accumulate_dicts(
                self._train_tokens,
                accum_fn=(lambda prev, curr: list(set(prev) | set(curr))),
                default=lambda v=None: set(sum(v, [])) if v else set(),
            ).values(),
            [],
        ))
    v_test = set(
        sum(
            accumulate_dicts(
                self._test_tokens,
                accum_fn=(lambda prev, curr: list(set(prev) | set(curr))),
                default=lambda v=None: set(sum(v, [])) if v else set(),
            ).values(),
            [],
        ))
    # train OOV tokens frequent enough to receive trained embeddings
    v_train_oov_over_t = (
        set(w for w in v_train
            if self._train_corpus.get(w, 0) >= self._oov_train_threshold)
        - v_orig)
    v_tot = v_train | v_test
    v_oov = v_tot - v_orig
    vocab_data_path = join(self._gen_dir, "vocab.pkl")
    if exists(vocab_data_path):
        self._vocab_data = unpickle_file(vocab_data_path)
    else:
        self._vocab_data = {
            "_ts": datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
            "_threshold": self._oov_train_threshold,
            "embedding": {
                "original": v_orig,
                "extended": v_extd,
            },
            "datasets": {
                "total": v_tot,
                "oov": v_oov,
                "train": {
                    "total": v_train,
                    "oov": {
                        "total": v_train - v_orig,
                        "embedded": v_train_oov_over_t,
                        "bucketed": v_train - v_extd,
                    },
                },
                "test": {
                    "total": v_test,
                    "oov": {
                        "total": v_test - v_orig,
                        "bucketed": v_test - v_extd,
                        "exclusive": v_test - (v_extd | v_train),
                    },
                },
            },
        }
        pickle_file(path=vocab_data_path, data=self._vocab_data)
    n_tot = len(v_tot)
    n_oov = len(v_oov)
    n_train = len(v_train)
    n_test = len(v_test)
    n_train_oov = len(v_train - v_orig)
    n_train_oov_embd = len(v_train_oov_over_t)
    n_train_oov_bktd = len(v_train - v_extd)
    n_test_oov = len(v_test - v_orig)
    n_test_oov_bktd = len(v_test - v_extd)
    n_test_oov_excl = len(v_test - (v_extd | v_train))

    def portion(p, tot=None):
        # format a count, optionally with its percentage of `tot`
        return str(p) + (" ({:.2f}%)".format((p / tot) * 100) if tot else "")

    return {
        "total": {
            "size": n_tot,
            "in_vocab": portion(n_tot - n_oov, tot=n_tot),
            "out_of_vocab": portion(n_oov, tot=n_tot),
        },
        "train": {
            "size": n_train,
            "oov": {
                "total": portion(n_train_oov, tot=n_train),
                "embedded": portion(n_train_oov_embd, tot=n_train),
                **({"bucketed": portion(n_train_oov_bktd, tot=n_train)}
                   if n_train_oov_bktd > 0 else {}),
            },
        },
        "test": {
            "size": n_test,
            "oov": {
                "total": portion(n_test_oov, tot=n_test),
                "bucketed": portion(n_test_oov_bktd, tot=n_test),
                **({"exclusive": portion(n_test_oov_excl, tot=n_test)}
                   if n_test_oov_excl > 0 else {}),
            },
        },
    }
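# Toy illustration of the set arithmetic above (all words are made up):
# embedding words are in-vocab, frequent train-only extras may be added to
# the extended vocab ("embedded"), the rest fall back to OOV buckets
# ("bucketed"), and test-only extras can never receive trained vectors.
v_orig = {"good", "bad", "food"}               # pretrained embedding vocab
v_extd = v_orig | {"bibimbap"}                 # extended with frequent train OOV
v_train = {"good", "food", "bibimbap", "srv"}  # tokens seen in training data
v_test = {"bad", "food", "wifi"}               # tokens seen in test data

assert v_train - v_orig == {"bibimbap", "srv"}  # train OOV
assert v_train - v_extd == {"srv"}              # train OOV left to buckets
assert v_test - (v_extd | v_train) == {"wifi"}  # test-exclusive OOV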
def _init_tfrecords(self):
    tokens_lists = {}
    for mode in ["train", "test"]:
        tfrecord_folder = "_{mode}".format(mode=mode)
        tfrecord_path = join(self._gen_dir, tfrecord_folder)
        if not exists(tfrecord_path):
            tokens_attr = "_{mode}_tokens".format(mode=mode)
            tokens_dict = getattr(self, tokens_attr)
            tokens_list = tokens_dict.values()
            tokens_lists[mode] = sum(tokens_list, [])
    fetch_results_path = join(self._gen_dir, "_fetch_results.pkl")
    if tokens_lists and not exists(fetch_results_path):
        vocab_file_templ = "_vocab{ext}"
        filtered_vocab_file = vocab_file_templ.format(ext=".filt.txt")
        filtered_vocab_path = join(self._gen_dir, filtered_vocab_file)
        if not exists(filtered_vocab_path):
            filtered_vocab = list(
                set(self._vocab)
                & set(corpora_vocab(self._train_corpus, self._test_corpus)))
            indices = [self._vocab.index(word) for word in filtered_vocab]
            write_vocab_file(filtered_vocab_path, filtered_vocab, indices)
        #! There has to be at least 1 bucket for any
        #! test-time oov tokens (possibly targets)
        lookup_table = ids_lookup_table(
            filtered_vocab_path,
            self._num_oov_buckets,
            vocab_size=len(self._vocab),
        )
        fetch_dict = fetch_lookup_ops(lookup_table, **tokens_lists)
        fetch_results = run_lookups(fetch_dict, metadata_path=self.gen_dir)
        pickle_file(path=fetch_results_path, data=fetch_results)
    else:
        fetch_results = unpickle_file(fetch_results_path)
    oov_buckets = {}
    for mode, values in fetch_results.items():
        data_dict_attr = "_{mode}_dict".format(mode=mode)
        data_dict = getattr(self, data_dict_attr)
        string_features, int_features = split_list(values, parts=2)
        tfexamples = make_tf_examples(string_features, int_features,
                                      data_dict["labels"])
        tfrecord_folder = "_{mode}".format(mode=mode)
        tfrecord_path = join(self._gen_dir, tfrecord_folder)
        if not exists(tfrecord_path):
            write_tfrecords(tfrecord_path, tfexamples)
        #! There has to be at least 1 bucket for any
        #! test-time oov tokens (possibly targets)
        buckets = [
            BUCKET_TOKEN.format(num=n + 1)
            for n in range(self._num_oov_buckets)
        ]
        oov_buckets[mode] = tokens_by_assigned_id(
            string_features,
            int_features,
            start=len(self._vocab),
            keys=buckets,
        )
    accum_oov_buckets = accumulate_dicts(
        **oov_buckets,
        accum_fn=lambda prev, curr: list(set(prev) | set(curr)),
    )
    self._oov_buckets = {
        buckets[i]: accum_oov_buckets[buckets[i]]
        for i in sorted([buckets.index(key) for key in [*accum_oov_buckets]])
    }
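# ids_lookup_table / fetch_lookup_ops / run_lookups appear to rely on
# TensorFlow lookup ops; the pure-Python sketch below only illustrates the id
# scheme they implement: in-vocab tokens keep their vocab index, OOV tokens
# hash into one of `num_oov_buckets` ids starting at `vocab_size`. The
# function name and the crc32 hashing are illustrative assumptions, not the
# scheme TensorFlow actually uses.
from zlib import crc32


def assign_token_ids(tokens, vocab, num_oov_buckets):
    vocab_ids = {word: index for index, word in enumerate(vocab)}
    vocab_size = len(vocab)
    ids = []
    for token in tokens:
        if token in vocab_ids:
            ids.append(vocab_ids[token])
        else:
            bucket = crc32(token.encode("utf-8")) % num_oov_buckets
            ids.append(vocab_size + bucket)
    return ids


# e.g. assign_token_ids(["good", "bibimbap"], ["good", "bad"], 1) -> [0, 2]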