def read_extracted_data(self) -> None:
    """Read existing extracted data.

    Data consists of common data and mode-specific data.
    """
    # Common data.
    if isfile(DatasetConfig.common_raw_data_file):
        common_data: CommonData = load_pkl(
            DatasetConfig.common_raw_data_file)
        self.dialog_vocab = common_data.dialog_vocab
        self.glove = common_data.glove
        self.image_url_id = common_data.image_url_id
        self.image_paths = common_data.image_paths

    # Mode-specific data.
    if self.mode & TRAIN_MODE and isfile(
            DatasetConfig.train_raw_data_file):
        train_data = load_pkl(DatasetConfig.train_raw_data_file)
        self.train_dialogs = train_data
    if self.mode & VALID_MODE and isfile(
            DatasetConfig.valid_raw_data_file):
        valid_data = load_pkl(DatasetConfig.valid_raw_data_file)
        self.valid_dialogs = valid_data
    if self.mode & TEST_MODE and isfile(DatasetConfig.test_raw_data_file):
        test_data = load_pkl(DatasetConfig.test_raw_data_file)
        self.test_dialogs = test_data
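# All snippets in this section depend on small pickle helpers (load_pkl,
# dump_pkl, save_pkl; sometimes accessed as util.load_pkl/util.dump_pkl)
# whose definitions are not shown. A minimal sketch, assuming plain
# pickle serialization; the actual project helpers may differ (save_pkl,
# for instance, is called with an extra display-name argument below):
import pickle


def load_pkl(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as f:
        return pickle.load(f)


def dump_pkl(obj, path):
    """Serialize *obj* into the pickle file at *path*."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)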
def train_setup(self):
    from featlists.ci import feats as flist
    assert "train_pkl" in self.params and "dev_pkl" in self.params
    logger.info(" * TRAINING: %s stage * " % self.name)
    vocab = util.load_pkl(self.params["vocab_fn"])
    train_data = util.load_pkl(self.params["train_pkl"])
    val_data = util.load_pkl(self.params["dev_pkl"])
    test_data = (util.load_pkl(self.params["test_pkl"])
                 if "test_pkl" in self.params else None)
    self.fx = Fxtractor(fx_maps=collections.defaultdict(),
                        labels=vocab["ctypes"],
                        flist=flist)
    self.fx.fx_dataset_ci(train_data, train=True)
    self.fx.fx_dataset_ci(val_data, train=False)
    self.fx.fx_maps["vidx"] = copy.deepcopy(vocab["vidx"])
    n_features = self.fx.fx_maps["ci_fdim"]
    n_classes = len(self.fx.fx_maps["ci_l2i"])
    self.clf = Perceptron.SparsePerceptron()
    self.clf.init_w(n_features, n_classes)
    self.ruleset = Ruleset()
    self.ruleset.load_db()
    logger.debug("Data: %d (train), %d (dev). Num feats %d"
                 % (len(train_data), len(val_data), n_features))
    logger.debug("Used features: \n%s", "\n".join(self.fx.flist))
    return vocab, train_data, val_data, test_data
def train_setup(self):
    from featlists.ri import feats as flist
    assert "train_pkl" in self.params and "dev_pkl" in self.params
    logger.info(" * TRAINING: %s stage * " % self.name)
    vocab = util.load_pkl(self.params["vocab_fn"])
    train_data = util.load_pkl(self.params["train_pkl"])
    val_data = util.load_pkl(self.params["dev_pkl"])
    test_data = (util.load_pkl(self.params["test_pkl"])
                 if "test_pkl" in self.params else None)
    # list() is required here: in Python 3, dict.keys() returns a view
    # that cannot be concatenated to a list directly.
    labels = ["<null>", "<unk>"] + list(vocab["edges"].keys())
    self.fx = Fxtractor(fx_maps=collections.defaultdict(),
                        labels=labels,
                        flist=flist)
    self.fx.fx_dataset_ri(train_data, train=True)
    self.fx.fx_dataset_ri(val_data, train=False)
    n_features = self.fx.fx_maps["ri_fdim"]
    n_classes = len(self.fx.fx_maps["ri_l2i"])
    self.clf = Perceptron.SparsePerceptron()
    self.clf.init_w(n_features, n_classes)
    self.amr_printer = Printer()
    logger.debug("Data: %d (train), %d (dev). Num feats %d"
                 % (len(train_data), len(val_data), n_features))
    logger.debug("Used features: \n%s", "\n".join(self.fx.flist))
    return vocab, train_data, val_data, test_data
def save_data(self):
    """Save QNAS data in a pickle file for logging and reloading purposes,
    including chromosomes, generation number, evaluation score and number
    of evaluations. Note that the data in the file is loaded and updated
    with the current generation, so that we keep track of the entire
    evolutionary process.
    """
    if self.current_gen == 0:
        data = dict()
    else:
        data = load_pkl(self.data_file)

    data[self.current_gen] = {
        'time': str(datetime.datetime.now()),
        'total_eval': self.total_eval,
        'best_so_far': self.best_so_far,
        'best_so_far_id': self.best_so_far_id,
        'fitnesses': self.fitnesses,
        'raw_fitnesses': self.raw_fitnesses,
        'lower': self.qpop_params.lower,
        'upper': self.qpop_params.upper,
        'params_pop': self.qpop_params.current_pop,
        'net_probs': self.qpop_net.probabilities,
        'num_net_nodes': self.qpop_net.chromosome.num_genes,
        'net_pop': self.qpop_net.current_pop
    }
    self.dump_pkl_data(data)
def load_qnas_data(self, file_path):
    """Read pkl data in *file_path* and load its information into the
    current QNAS. If the new pkl data file *self.data_file* does not
    exist yet, the loaded log is also saved into it.

    Args:
        file_path: (str) path to the pkl data file.
    """
    log_data = load_pkl(file_path)
    if not os.path.exists(self.data_file):
        self.dump_pkl_data(log_data)

    generation = max(log_data.keys())
    log_data = log_data[generation]

    self.current_gen = generation
    self.total_eval = log_data['total_eval']
    self.best_so_far = log_data['best_so_far']
    self.best_so_far_id = log_data['best_so_far_id']
    self.qpop_net.chromosome.set_num_genes(log_data['num_net_nodes'])
    self.fitnesses = log_data['fitnesses']
    self.raw_fitnesses = log_data['raw_fitnesses']
    self.qpop_params.lower = log_data['lower']
    self.qpop_params.upper = log_data['upper']
    self.qpop_net.probabilities = log_data['net_probs']
    self.qpop_params.current_pop = log_data['params_pop']
    self.qpop_net.current_pop = log_data['net_pop']
def load_evolved_data(self, generation=None, individual=0):
    """Read the pkl log *self.files_spec['data_file']* and get values from
    the individual specified by *generation* and *individual*.

    Args:
        generation: (int) generation number from which data will be
            loaded. If None, loads the last generation data.
        individual: (int) number of the classical individual to be
            loaded. If no number is specified, individual 0 is loaded
            (the one with the highest fitness in the given *generation*).
    """
    log_data = load_pkl(self.files_spec['data_file'])
    if generation is None:
        generation = max(log_data.keys())
    log_data = log_data[generation]

    params_pop = log_data['params_pop']
    net_pop = log_data['net_pop']
    assert individual < net_pop.shape[0], \
        "The individual number cannot be bigger than the size of the population!"

    params = QChromosomeParams(
        params_ranges=self.QNAS_spec['params_ranges']).decode(
            params_pop[individual])
    net = QChromosomeNetwork(
        fn_list=self.QNAS_spec['fn_list'],
        max_num_nodes=log_data['num_net_nodes']).decode(net_pop[individual])

    self.evolved_params = {'params': params, 'net': net}
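# A minimal sketch showing how the pickled QNAS log written by save_data()
# can be inspected directly; the path 'exp/data_QNAS.pkl' is an assumption
# for illustration. The keys ('total_eval', 'best_so_far') match the dict
# written per generation above.
def print_qnas_log_summary(data_file='exp/data_QNAS.pkl'):
    log_data = load_pkl(data_file)
    for generation in sorted(log_data.keys()):
        entry = log_data[generation]
        print('gen %d: best_so_far=%s, total_eval=%s'
              % (generation, entry['best_so_far'], entry['total_eval']))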
def tfidf(data_set, space_path):
    if os.path.exists(space_path):
        tfidf_space = load_pkl(space_path)
    else:
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf_space = transformer.fit_transform(
            vectorizer.fit_transform(data_set))
        dump_pkl(tfidf_space, space_path)
    print('tfidf shape:', tfidf_space.shape)
    return tfidf_space
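# A minimal usage sketch for tfidf() above; the toy corpus and cache path
# are assumptions for illustration. The first call computes and caches the
# TF-IDF matrix; later calls load it straight from the pickle.
docs = ['the cat sat', 'the dog barked', 'the cat barked']
space = tfidf(docs, 'data/tfidf_space.pkl')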
def load_params(exp_path, generation=None, individual=0):
    """Load the parameters from *exp_path/log_params_evolution.txt* and the
    data from *exp_path/data_QNAS.pkl*. The data loaded is the network
    encoded by individual *individual* of generation *generation*.

    Args:
        exp_path: (str) path to the directory containing evolution files.
        generation: (int) the generation number of the individual to be
            profiled. If *None*, the last generation will be used.
        individual: (int) the number of the individual in *generation* to
            be profiled.

    Returns:
        dict holding all the necessary parameters and data.
    """
    log_file_path = os.path.join(exp_path, 'log_params_evolution.txt')
    log_data_path = os.path.join(exp_path, 'data_QNAS.pkl')

    params = load_yaml(log_file_path)
    log_data = load_pkl(log_data_path)

    input_shape = (1,
                   params['train_data_info']['height'],
                   params['train_data_info']['width'],
                   params['train_data_info']['num_channels'])

    # Load the last generation if none is specified.
    if generation is None:
        generation = max(log_data.keys())
    log_data = log_data[generation]

    nets = log_data['net_pop']
    net = QChromosomeNetwork(
        fn_list=params['QNAS']['fn_list'],
        max_num_nodes=params['QNAS']['max_num_nodes']).decode(nets[individual])

    loaded_params = {
        'individual_id_str': f"Generation {generation} - individual {individual}",
        'individual_id': (generation, individual),
        'net_list': net,
        'input_shape': input_shape,
        'num_classes': params['train_data_info']['num_classes'],
        'fn_dict': params['fn_dict'],
        'fn_list': params['QNAS']['fn_list']
    }
    return loaded_params
def __init__(self):
    self.styletips_data: StyleTipsData = None
    self.celebrity_data: CelebrityData = None
    self.attribute_data: AttributeData = None

    if isfile(DatasetConfig.knowledge_data_file):
        # Read existing extracted data file.
        knowledge_data = load_pkl(DatasetConfig.knowledge_data_file)
        self.styletips_data = knowledge_data.styletips_data
        self.celebrity_data = knowledge_data.celebrity_data
        self.attribute_data = knowledge_data.attribute_data
    else:
        # Load data from the raw data files and save it into a pkl.
        self.styletips_data = StyleTipsData.from_file()
        self.celebrity_data = CelebrityData.from_file()
        self.attribute_data = AttributeData.from_file()
        save_pkl(self, 'KnowledgeData', DatasetConfig.knowledge_data_file)
def main(params):
    captions_file = params.captions_file
    output_file = params.output_file

    vids = util.load_pkl(captions_file)
    st_model = skipthoughts.load_model()

    skip_vectors = {}
    for vid in vids.keys():
        caps = vids[vid]
        num_caps = len(caps)
        # Order the raw captions by their cap_id before encoding.
        raw_caps = ['' for x in range(num_caps)]
        for cap in caps:
            raw_caps[int(cap['cap_id'])] = cap['tokenized']
        vector = skipthoughts.encode(st_model, raw_caps, verbose=False)
        skip_vectors[vid] = vector

    util.dump_pkl(skip_vectors, output_file)
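# The captions pickle loaded by main() is assumed (from the access pattern
# above, not confirmed by the source) to map video ids to lists of caption
# dicts carrying 'cap_id' and 'tokenized' fields, e.g.:
#
#     vids = {
#         'video0': [
#             {'cap_id': '0', 'tokenized': 'a man plays guitar'},
#             {'cap_id': '1', 'tokenized': 'someone strums a guitar'},
#         ],
#     }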
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt

from algs import load_alg
import util
from feature_data import DataFeatures

data = util.load_pkl('data/ml/graph_features.pkl')
alg = load_alg('Random_Forest')
clf = alg.clf

X = data.get_joint_matrix(util.features)
train_x = X[data.train_indices]
# train_y = data.labels[data.train_indices]
val_x = X[data.val_indices]
# val_y = data.labels[data.val_indices]
y = data.labels[data.train_indices]
for i in range(len(y)):
    if i > 50:
        y[i] = 1
    else:
        y[i] = 0

# Fix feature values
feature_ind = [1, 2]
filler_vals = {0: 1, 3: 1, 4: 1, 5: 1}
filler_ranges = {0: 1, 3: 1, 4: 1, 5: 1}
# feature_cols = {0: 'degree', 1: 'clustering', 2: 'comm_edge_density',
#                 3: 'comm_sz', 4: 'comm_review_count', 5: 'split'}
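# The script above prepares inputs for mlxtend's plot_decision_regions but
# the plotting call itself is missing; a hedged sketch of how these pieces
# are typically wired together (the exact call is an assumption):
plot_decision_regions(X=train_x, y=y.astype(int), clf=clf,
                      feature_index=feature_ind,
                      filler_feature_values=filler_vals,
                      filler_feature_ranges=filler_ranges)
plt.show()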
def train(task: int, model_file_name: str):
    """Train model.

    Args:
        task (int): Task.
        model_file_name (str): Model file name (saved or to be saved).
    """
    # Check if data exists.
    if not isfile(DatasetConfig.common_raw_data_file):
        raise ValueError('No common raw data.')

    # Load extracted common data.
    common_data: CommonData = load_pkl(DatasetConfig.common_raw_data_file)

    # Dialog data files.
    train_dialog_data_file = DatasetConfig.get_dialog_filename(
        task, TRAIN_MODE)
    valid_dialog_data_file = DatasetConfig.get_dialog_filename(
        task, VALID_MODE)
    test_dialog_data_file = DatasetConfig.get_dialog_filename(task, TEST_MODE)
    if not isfile(train_dialog_data_file):
        raise ValueError('No train dialog data file.')
    if not isfile(valid_dialog_data_file):
        raise ValueError('No valid dialog data file.')

    # Load extracted dialogs.
    train_dialogs: List[TidyDialog] = load_pkl(train_dialog_data_file)
    valid_dialogs: List[TidyDialog] = load_pkl(valid_dialog_data_file)
    test_dialogs: List[TidyDialog] = load_pkl(test_dialog_data_file)

    if task in {KNOWLEDGE_TASK}:
        knowledge_data = KnowledgeData()

    # Dataset wrap.
    train_dataset = Dataset(
        task,
        common_data.dialog_vocab,
        None,  # common_data.obj_id,
        train_dialogs,
        knowledge_data if task == KNOWLEDGE_TASK else None)
    valid_dataset = Dataset(
        task,
        common_data.dialog_vocab,
        None,  # common_data.obj_id,
        valid_dialogs,
        knowledge_data if task == KNOWLEDGE_TASK else None)
    test_dataset = Dataset(
        task,
        common_data.dialog_vocab,
        None,  # common_data.obj_id,
        test_dialogs,
        knowledge_data if task == KNOWLEDGE_TASK else None)

    print('Train dataset size:', len(train_dataset))
    print('Valid dataset size:', len(valid_dataset))
    print('Test dataset size:', len(test_dataset))

    # Get initial embedding.
    vocab_size = len(common_data.dialog_vocab)
    embed_init = get_embed_init(common_data.glove,
                                vocab_size).to(GlobalConfig.device)

    # Context model configurations.
    context_text_encoder_config = ContextTextEncoderConfig(
        vocab_size, embed_init)
    context_image_encoder_config = ContextImageEncoderConfig()
    context_encoder_config = ContextEncoderConfig()

    # Context models.
    context_text_encoder = TextEncoder(context_text_encoder_config)
    context_text_encoder = context_text_encoder.to(GlobalConfig.device)
    context_image_encoder = ImageEncoder(context_image_encoder_config)
    context_image_encoder = context_image_encoder.to(GlobalConfig.device)
    context_encoder = ContextEncoder(context_encoder_config)
    context_encoder = context_encoder.to(GlobalConfig.device)

    # Load model file.
    model_file = join(DatasetConfig.dump_dir, model_file_name)
    if isfile(model_file):
        state = torch.load(model_file)
        # if task != state['task']:
        #     raise ValueError("Task doesn't match.")
        context_text_encoder.load_state_dict(state['context_text_encoder'])
        context_image_encoder.load_state_dict(state['context_image_encoder'])
        context_encoder.load_state_dict(state['context_encoder'])

    # Task-specific parts.
    if task == INTENTION_TASK:
        intention_train(context_text_encoder,
                        context_image_encoder,
                        context_encoder,
                        train_dataset,
                        valid_dataset,
                        test_dataset,
                        model_file)
    elif task == TEXT_TASK:
        text_train(context_text_encoder,
                   context_image_encoder,
                   context_encoder,
                   train_dataset,
                   valid_dataset,
                   test_dataset,
                   model_file,
                   common_data.dialog_vocab,
                   embed_init)
    elif task == RECOMMEND_TASK:
        recommend_train(context_text_encoder,
                        context_image_encoder,
                        context_encoder,
                        train_dataset,
                        valid_dataset,
                        test_dataset,
                        model_file,
                        vocab_size,
                        embed_init)
    elif task == KNOWLEDGE_TASK:
        knowledge_attribute_train(context_text_encoder,
                                  context_image_encoder,
                                  context_encoder,
                                  train_dataset,
                                  valid_dataset,
                                  test_dataset,
                                  model_file,
                                  knowledge_data.attribute_data,
                                  common_data.dialog_vocab,
                                  embed_init)
def load_alg(name):
    path = 'data/ml/results/' + name + '.pkl'
    if os.path.isfile(path):
        return util.load_pkl(path)
    model_name = name.split('-')[0]
    return Algorithm(name, util.model_dict[model_name])
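# load_alg() first looks for a pickled result under data/ml/results/ and
# only builds a fresh Algorithm when no cache exists. Everything before the
# first '-' selects the model, so suffixed variants share a base model,
# e.g. (name chosen for illustration, matching the usage further below):
#
#     alg = load_alg('Decision_Tree-extra')  # uses util.model_dict['Decision_Tree']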
import codecs
import os
from collections import Counter

import kenlm

import config
from util import tokenize, preprocess, load_pkl, dump_pkl

bigram_path = 'data/kenlm/zhwiki_bigram.klm'
bigram = kenlm.Model(bigram_path)
print('Loaded bigram language model from {}'.format(bigram_path))

trigram_path = 'data/kenlm/zhwiki_trigram.klm'
trigram = kenlm.Model(trigram_path)
print('Loaded trigram language model from {}'.format(trigram_path))

text_path = 'data/train_input.txt'
text_counter_path = 'data/train_input_counter.pkl'
# Character frequency statistics.
if os.path.exists(text_counter_path):
    char_counter = load_pkl(text_counter_path)
else:
    print('generate counter from text file:', text_path)
    char_counter = Counter(
        codecs.open(text_path, 'r', encoding='utf-8').read())
    dump_pkl(char_counter, text_counter_path)


def load_same_pinyin(path, sep='\t'):
    """Load homophone (same-pinyin) characters.

    :param path:
    :return:
    """
    result = dict()
    if not os.path.exists(path):
        print("file not exists:", path)
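# A minimal usage sketch for the models loaded above (the example sentence
# is an assumption for illustration; kenlm's score() expects
# space-separated tokens and returns a log10 probability):
#
#     print('bigram score:', bigram.score('我 们 在 哪 里'))
#     print('trigram score:', trigram.score('我 们 在 哪 里'))
#     print('most common chars:', char_counter.most_common(5))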
if __name__ == "__main__":
    # NOTE: folder and names are assumed to be defined earlier in this
    # script; only the commented-out block below shows a folder value.
    feature_path = 'data/ml/graph_features.pkl'
    x = DataFeatures(folder, True)
    for name in names:
        opts = {}
        if name == "Decision_Tree":
            opts = {
                'min_samples_split': 20,
                'max_features': 'log2',
                'min_samples_leaf': 20
            }
            name += "-extra"
        a = algs.load_alg(name)
        data = util.load_pkl(feature_path)
        a.run(data, util.features, clf_options=opts)
        a.to_csv()

    # folder = 'edge_rem_split_angle_norm'
    # feature_path = 'data/ml/graph_features_{}.pkl'.format(folder)
    # x = DataFeatures(folder, True)
    # for name in names:
    #     opts = {}
    #     if name == "Decision_Tree":
    #         opts = {'min_samples_split': 20, 'max_features': 'log2',
    #                 'min_samples_leaf': 20}
    #         name += "-ROUND2-angle-norm"
    #     a = algs.load_alg(name)
    #     data = util.load_pkl(feature_path)
    #
def retrieve_data():
    X_train, X_test, y_train, y_test = util.load_pkl(
        "../cleaned_data/data-new.pkl")
    return X_train, X_test, y_train, y_test
import matplotlib.pyplot as plt
import numpy as np

import util

seed_text = util.load_pkl('./results/seed_text_gru4.pkl')
print("seed_text: ", seed_text)

theta = util.load_pkl('./results/theta_gru4.pkl')
# theta = np.exp(theta)
# theta = theta / np.sum(theta)
theta = theta[0]
print("theta: ", theta)

N = len(theta)
x = range(N)
width = 1 / 2.0
plt.bar(x, theta, width, color="blue")
plt.title("Inferred Topic Distribution from TopicGRU", fontsize=20)
plt.savefig("theta_gru4.pdf")
plt.show()
def retrieve_model():
    model = util.load_pkl("../models/NN.pkl")
    return model
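# A minimal sketch tying retrieve_data() and retrieve_model() together; the
# sklearn-style score() interface is an assumption about the pickled NN
# model, not confirmed by the source:
X_train, X_test, y_train, y_test = retrieve_data()
model = retrieve_model()
print('test accuracy:', model.score(X_test, y_test))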