def dev(model):
    """Evaluate ``model`` on the dev split.

    The summed absolute and squared errors are accumulated over every batch
    produced by the data helper and normalised by the total number of
    evaluated samples.

    :param model: callable model; ``model(content)`` must return predictions
        that can be squeezed along dimension 1.
    :return: tuple ``(mean_absolute_error, mean_squared_error)`` as floats.
    :raises ValueError: if the dev split yields no samples.
    """
    data_helper = DataHelper(mode='dev')
    b_size = len(data_helper.label)
    print('*' * 100)
    print('dev set total:', b_size)

    # reduction='sum' so per-batch values can be added and normalised once.
    loss_func = torch.nn.MSELoss(reduction='sum')
    loss_mae = torch.nn.L1Loss(reduction='sum')

    model.eval()
    abs_error_sum = 0.0
    sq_error_sum = 0.0
    sample_count = 0
    for content, label, _ in data_helper.batch_iter(batch_size=b_size, num_epoch=1):
        pred = model(content)
        pred_sq = torch.squeeze(pred, 1).cpu().data
        target = label.cpu()
        # Accumulate across batches instead of keeping only the last batch.
        sq_error_sum += float(loss_func(pred_sq, target))
        abs_error_sum += float(loss_mae(pred_sq, target))
        sample_count += len(label)

    if sample_count == 0:
        raise ValueError('dev set produced no samples to evaluate')

    return abs_error_sum / sample_count, sq_error_sum / sample_count
def model_test(model):
    """Restore the latest checkpoint and run ``ano_detect`` on test samples.

    Test samples are generated from ``./data/fake_data.mat``. The first
    ``model._tf_dim`` feature columns and the last ``model._sf_dim`` feature
    columns are fed to the model separately. If no checkpoint is found the
    function returns without evaluating anything.

    :param model: model exposing ``decompose()``, ``stfeature`` and the
        ``_sfeature`` / ``_tfeature`` / ``_flow`` placeholders.
    :return: None
    """
    mat = sio.loadmat("./data/fake_data.mat")
    helper = DataHelper(mat, 19*7*24, 1*7*24, 50)
    feature, flow, label = helper.gen_test_samples()
    leading_cols = feature[:, :model._tf_dim]
    trailing_cols = feature[:, -model._sf_dim:]

    err_op = model.decompose()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(os.path.dirname("checkpoints/checkpoint"))
        if not (ckpt and ckpt.model_checkpoint_path):
            # Nothing to evaluate without trained weights.
            return
        saver.restore(sess, ckpt.model_checkpoint_path)
        inputs = {
            model._sfeature: trailing_cols,
            model._tfeature: leading_cols,
            model._flow: flow
        }
        err, stfeature = sess.run([err_op, model.stfeature], feed_dict=inputs)
        ano_detect(flow, err, stfeature, label)
def cal_PMI(window_size=20):
    """Build PMI-weighted edges between vocabulary words of the train split.

    Co-occurrences are counted inside a window around each token (positions
    ``i - window_size`` up to, but excluding, ``i + window_size``). Words
    missing from ``helper.d`` are skipped. Negative, NaN and infinite PMI
    values are clamped to zero, so only positive-PMI pairs become edges.

    :param window_size: half-width of the co-occurrence window, in tokens.
    :return: tuple ``(edges_weights, edges_mappings, count)`` where
        ``edges_weights`` is a ``(count, 1)`` torch tensor whose row 0 is the
        reserved weight ``0.0``, ``edges_mappings[i, j]`` is the row of the
        edge between words ``i`` and ``j`` (0 when there is no edge), and
        ``count`` is the number of rows in ``edges_weights``.
    """
    helper = DataHelper(mode="train")
    content, _ = helper.get_content()
    vocab_size = len(helper.vocab)

    pair_count_matrix = np.zeros((vocab_size, vocab_size), dtype=int)
    word_count = np.zeros(vocab_size, dtype=int)

    for sentence in content:
        tokens = sentence.split(' ')
        for i, word in enumerate(tokens):
            try:
                word_idx = helper.d[word]
            except KeyError:
                # Out-of-vocabulary centre word: nothing to count.
                continue
            word_count[word_idx] += 1

            start_index = max(0, i - window_size)
            end_index = min(len(tokens), i + window_size)
            for j in range(start_index, end_index):
                if i == j:
                    continue
                try:
                    pair_count_matrix[word_idx, helper.d[tokens[j]]] += 1
                except KeyError:
                    continue

    total_count = np.sum(word_count)
    word_prob = word_count / total_count
    pair_prob = pair_count_matrix / total_count

    # Vectorised PMI; log(0) and 0/0 are expected here and cleaned up below.
    with np.errstate(divide='ignore', invalid='ignore'):
        pmi_matrix = np.log(pair_prob / np.outer(word_prob, word_prob))
    pmi_matrix = np.nan_to_num(pmi_matrix)
    pmi_matrix = np.maximum(pmi_matrix, 0.0)

    # np.nonzero walks the matrix in row-major order, which matches the
    # numbering produced by a nested ``for i ... for j ...`` scan.
    rows, cols = np.nonzero(pmi_matrix)
    count = len(rows) + 1
    edges_mappings = np.zeros((vocab_size, vocab_size), dtype=int)
    edges_mappings[rows, cols] = np.arange(1, count)

    edges_weights = np.concatenate(([0.0], pmi_matrix[rows, cols]))
    edges_weights = edges_weights.reshape(-1, 1)
    edges_weights = torch.Tensor(edges_weights)

    return edges_weights, edges_mappings, count
def model_train(model):
    """Train ``model`` with Adam, resuming from the latest checkpoint if any.

    Batches come from ``./data/fake_data.mat``. Each step feeds the batch
    together with a randomly permuted copy of itself (``x1`` / ``y1``).
    The loss of the last step is printed per epoch and a checkpoint is
    written to ``checkpoints/model``.

    :param model: model exposing ``construct_loss()``, ``loss`` and the
        ``x0`` / ``y0`` / ``x1`` / ``y1`` placeholders.
    :return: None
    """
    mat = sio.loadmat("./data/fake_data.mat")
    helper = DataHelper(mat, 19*7*24, 1*7*24, 50)

    model.construct_loss()
    train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(model.loss)
    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(init_op)
        ckpt = tf.train.get_checkpoint_state(os.path.dirname("checkpoints/checkpoint"))
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

        feature_batches, flow_batches = helper.gen_train_batch(BATCH_SIZE)
        for epoch in range(EPOCH):
            for x0, y0 in zip(feature_batches, flow_batches):
                # Pair every sample with a random partner from the same batch.
                permutation = np.arange(BATCH_SIZE)
                np.random.shuffle(permutation)
                step_inputs = {
                    model.x0: x0,
                    model.y0: y0,
                    model.x1: x0[permutation],
                    model.y1: y0[permutation],
                }
                _, loss = sess.run([train_op, model.loss], feed_dict=step_inputs)
            print("Epoch {}: loss {}".format(epoch, loss))
            saver.save(sess, "checkpoints/model")
def graph_eval(core_words):
    """Rank the vocabulary neighbours of each core word by learned edge weight.

    For every core word, each vocabulary word that has a non-zero entry in
    the edge mapping towards the core word is paired with the corresponding
    weight from the saved model's ``seq_edge_w`` layer.

    :param core_words: iterable of words; each must be present in the test
        vocabulary (``list.index`` raises ``ValueError`` otherwise).
    :return: list with one entry per core word; each entry is a list of
        ``(word, weight)`` tuples sorted by ascending weight.
    """
    data_helper = DataHelper('test')
    vocab = data_helper.vocab
    _, edges_matrix = edges_dist(len(vocab), data_helper, 1)
    model = torch.load(os.path.join('temp_model.pkl'))
    # NOTE(review): the returned content/labels are unused here; the call is
    # kept in case DataHelper relies on it having run -- confirm and remove.
    data_helper.get_content()
    edges_weights = model.seq_edge_w.weight.to('cpu').detach().numpy()

    graph_ed = []
    for core_word in core_words:
        # Must be resolved per core word: the lookups below index by it.
        core_index = vocab.index(core_word)
        results = {}
        for i, word in enumerate(vocab):
            n_word = edges_matrix[i, core_index]
            if n_word != 0:
                results[word] = edges_weights[n_word][0]
        # Dict keys are already unique, so no extra de-duplication is needed.
        graph_ed.append(sorted(results.items(), key=lambda d: d[1]))
    return graph_ed
def graph_eval_extend(core_words):
    """Rank, for each core word, the *other* core words by learned edge weight.

    Unlike a scan over the whole vocabulary, only the remaining entries of
    ``core_words`` are considered, and no filtering on a zero edge mapping
    is applied.

    :param core_words: iterable of words; each must be present in the test
        vocabulary (``list.index`` raises ``ValueError`` otherwise).
    :return: list with one entry per core word; each entry is a list of
        ``(word, weight)`` tuples sorted by ascending weight.
    """
    data_helper = DataHelper('test')
    vocab = data_helper.vocab
    _, edges_matrix = edges_mapping1(len(vocab), data_helper, 1)
    model = torch.load(os.path.join('temp_model.pkl'))
    # NOTE(review): the returned content/labels are unused here; the call is
    # kept in case DataHelper relies on it having run -- confirm and remove.
    data_helper.get_content()
    edges_weights = model.seq_edge_w.weight.to('cpu').detach().numpy()

    graph_ed = []
    for core_word in core_words:
        core_index = vocab.index(core_word)
        # Vocabulary indices of every other core word (rebuilt per core word).
        other = [vocab.index(cc) for cc in core_words if cc != core_word]

        results = {}
        for o_ind in other:
            n_word = edges_matrix[o_ind, core_index]
            results[vocab[o_ind]] = edges_weights[n_word][0]

        # Dict keys are already unique, so no extra de-duplication is needed.
        graph_ed.append(sorted(results.items(), key=lambda d: d[1]))
    return graph_ed
def _get_data(data_name='citeulike_title_only'):
    """Load the cold-item split for ``data_name`` into a fresh DataHelper.

    ``data_name`` must start with one of ``citeulike_title_only``,
    ``citeulike_title_and_abstract``, ``news_title_only`` or
    ``news_title_and_abstract``. If it contains exactly one ``fold<N>``
    token, the files are read from the ``fold<N>`` sub-folder.

    :param data_name: dataset identifier as described above.
    :return: a ``DataHelper`` whose ``data`` attribute holds the unpickled
        split file.
    :raises AssertionError: if ``data_name`` matches no known dataset.
    """
    data_helper = DataHelper()

    def error():
        # Raised explicitly so the check is not stripped by ``python -O``;
        # the exception type matches the previous ``assert False``.
        raise AssertionError('[ERROR] unseen data_name %s' % data_name)

    sub_folder = ''
    fold = re.findall(r'fold(\d+)', data_name)
    if len(fold) == 1:
        sub_folder = 'fold%d' % int(fold[0])

    known_layouts = (
        ('citeulike_title_only', '/citeulike/title_only'),
        ('citeulike_title_and_abstract', '/citeulike/title_and_abstract'),
        ('news_title_only', '/news/title_only'),
        ('news_title_and_abstract', '/news/title_and_abstract'),
    )
    for prefix, folder in known_layouts:
        if data_name.startswith(prefix):
            base = data_root + folder + '/%s' % sub_folder
            content_file = base + '/data_content.pkl'
            split_file = base + '/data_split_cold_item.pkl'
            break
    else:
        # Covers unknown variants and unknown dataset families alike.
        error()

    # NOTE(review): ``content_file`` is built but never loaded; the
    # ``data_helper.load_data(content_file)`` call is disabled in this file.
    # NOTE(security): pickle.load can execute arbitrary code -- only point
    # data_root at trusted files.
    with open(split_file, 'rb') as fp:  # pickle requires a binary stream
        split_data = pickle.load(fp)
    data_helper.data = split_data
    return data_helper
def __init__(self):
    """Create the data and day-statistics helpers; use 288 prediction steps."""
    self.prediction_steps = 288
    self.dh = DataHelper()
    self.ds = DaysStatistics()
def __init__(self, estimator_name, param_dict):
    """
    Constructor of MyGridSearch. Prepares all necessary variables and
    prepares the generator for grid search.

    :param estimator_name: String with the name of the sklearn predictor.
    Must be exactly the same as the sklearn name, because this string is
    evaluated to obtain the predictor. Not the best solution but a working one.
    :param param_dict: Dictionary with parameters to search through. Each key
    must be the exact name of an input parameter of the predictor, each value
    must be a list of suitable values. The current implementation has problems
    with strings, so only numeric values are supported for now.
    :raises AssertionError: if a key of ``param_dict`` is not an argument of
    the estimator.
    """
    # NOTE(security): eval() executes arbitrary code; estimator_name must
    # never come from untrusted input.
    inspector = inspect.getfullargspec(eval(estimator_name))
    for key in param_dict:
        # The message uses estimator_name: no ``estimator`` object exists
        # here, so referencing one would mask the real error with a NameError.
        assert key in inspector.args, \
            'Argument %s is not valid for class %s' % (key, estimator_name)

    self.estimator_name = estimator_name
    self.param_dict = param_dict
    self.evaluation = []
    self.parameters = []
    self.best_parameters = None
    self.best_evaluation = None
    self.best_estimator = None
    self.arguments = None
    self.generator = None
    self.grid_size = 1
    self.dh = DataHelper()
    self.prepare_generator()
def _eval_model(self, img_filenames, labels_filenames, model):
    """Print a confusion matrix and return accuracy for a subset of labels.

    Only samples whose stored label is 0, 1, 2 or 6 are evaluated; their
    ground truth is obtained through ``DataHelper.load_and_relabel_exp``.

    :param img_filenames: image file names, relative to ``self.img_path``.
    :param labels_filenames: label file names (same order as the images),
        relative to ``self.annotation_path``.
    :param model: callable returning a batch of logits for a batch of images.
    :return: accuracy score over the evaluated samples.
    """
    dhl = DataHelper()
    evaluated_labels = (0, 1, 2, 6)
    gt_lbls, pr_lbls = [], []

    for i, file_name in tqdm(enumerate(img_filenames)):
        anno_file = self.annotation_path + labels_filenames[i]
        if int(load(anno_file)) not in evaluated_labels:
            continue

        pixels = np.array(imread(self.img_path + file_name)) / 255.0
        img = np.expand_dims(pixels, axis=0)
        gt_lbls.append(dhl.load_and_relabel_exp(anno_file))

        score = tf.nn.softmax(model(img)[0])
        pr_lbls.append(np.argmax(score))

        # Both lists grow in lockstep, so the last entries belong together.
        print('Gt => ' + str(gt_lbls[-1]) + ' : ' + str(pr_lbls[-1]) + ' <= Pr')

    print(confusion_matrix(gt_lbls, pr_lbls))
    return accuracy_score(gt_lbls, pr_lbls)
def train(self):
    """
    Run the training loop.

    Every 100 steps the training loss and perplexity are printed. Every
    ``config.eval_every`` steps the model is evaluated on ``self.eval_data``,
    the mean loss/perplexity are printed and a checkpoint is saved under
    ``config.ckpt_model_path``.
    """
    gpu_options = tf.GPUOptions(allow_growth=True,
                                per_process_gpu_memory_fraction=1.0)
    session_conf = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False,
                                  gpu_options=gpu_options)
    cfg = self.config

    with tf.Session(config=session_conf) as sess:
        sess.run(tf.global_variables_initializer())
        current_step = 0

        for epoch in range(cfg.epochs):
            epoch_started = time.time()
            print("-------- Epoch {}/{} -------".format(epoch + 1, cfg.epochs))

            for batch in DataHelper.next_batch(self.train_data, cfg.batch_size):
                current_step += 1
                loss = self.model.train(sess, batch, cfg)
                perplexity = cal_perplexity(loss)

                if current_step % 100 == 0:
                    print("train ---> step: {}, loss: {}, perplexity: {}".
                          format(current_step, loss, perplexity))

                if current_step % cfg.eval_every != 0:
                    continue

                eval_losses = [
                    self.model.eval(sess, eval_batch)
                    for eval_batch in DataHelper.next_batch(self.eval_data,
                                                            cfg.batch_size)
                ]
                eval_perplexities = [cal_perplexity(value)
                                     for value in eval_losses]
                print("\n")
                print("eval ---> step: {}, loss: {}, perplexity: {}".
                      format(current_step, mean(eval_losses),
                             mean(eval_perplexities)))
                print("\n")

                # Save a checkpoint of the model.
                ckpt_dir = cfg.ckpt_model_path
                if not os.path.exists(ckpt_dir):
                    os.makedirs(ckpt_dir)
                self.model.saver.save(sess,
                                      os.path.join(ckpt_dir, "model"),
                                      global_step=current_step)

            print("------time: {}----------".format(time.time() - epoch_started))
def grab_core(core_words):
    """Collect every vocabulary word that shares an edge with a core word.

    A word is collected when its entry in the edge mapping towards the core
    word is non-zero. Words are appended once per core word they connect
    to, so the result may contain duplicates.

    :param core_words: iterable of words; each must be present in the test
        vocabulary (``list.index`` raises ``ValueError`` otherwise).
    :return: list of connected words, in core-word then vocabulary order.
    """
    data_helper = DataHelper('test')
    vocab = data_helper.vocab
    _, edges_matrix = edges_mapping1(len(vocab), data_helper, 1)
    # NOTE(review): neither the loaded model nor the content is used below;
    # both calls are kept only so that a missing model file or data still
    # fails here as before -- confirm and remove.
    torch.load(os.path.join('temp_model.pkl'))
    data_helper.get_content()

    cores_w = []
    for core_word in core_words:
        # Must be resolved per core word: the lookup below indexes by it.
        core_index = vocab.index(core_word)
        cores_w.extend(
            word for i, word in enumerate(vocab)
            if edges_matrix[i, core_index] != 0
        )
    return cores_w
def _clean_data(X, y):
    """Drop samples whose label disagrees with a cross-validated ensemble.

    The data is split into ``MajorityFiltering.k_folds`` stratified folds.
    For every fold an ensemble is fitted on the remaining folds and used to
    predict the held-out samples; only held-out samples whose prediction
    equals their label in ``y`` are kept.

    :param X: feature frame.
    :param y: label series aligned with ``X``.
    :return: tuple ``(clean_X, clean_y)`` with the retained rows, in fold
        order.
    :raises ValueError: if the retained rows contain duplicate index values
        (``verify_integrity=True``).
    """
    # Local import: DataFrame.append / Series.append were removed in
    # pandas 2.0, and a single concat also avoids re-copying on every fold.
    from pandas import concat

    kept_X = []
    kept_y = []
    skf = StratifiedKFold(n_splits=MajorityFiltering.k_folds, shuffle=True)

    for train_idxs, val_idxs in skf.split(X=range(len(y)), y=y):
        train_X = DataHelper.select_rows(X, train_idxs, copy=False)
        train_y = DataHelper.select_rows(y, train_idxs, copy=False)

        ensemble = MajorityFiltering.get_ensemble()
        ensemble.fit(train_X, train_y)

        val_X = DataHelper.select_rows(X, val_idxs, copy=False)
        predictions = ensemble.predict(val_X)

        # Keep only held-out samples the ensemble agrees with.
        maintain_idxs = [idx for idx, predicted in zip(val_idxs, predictions)
                         if predicted == y.iloc[idx]]

        kept_X.append(DataHelper.select_rows(X, maintain_idxs, copy=True))
        kept_y.append(DataHelper.select_rows(y, maintain_idxs, copy=True))

    clean_X = concat(kept_X, verify_integrity=True, sort=False)
    clean_y = concat(kept_y, verify_integrity=True)
    return clean_X, clean_y
def __init__(self, double_model=False):
    """
    Constructor.

    :param double_model: Bool selecting a single or a double model. A single
    model uses one tree model to predict all days. A double model uses one
    model for weekday predictions and another one for weekend predictions.
    """
    self.double_model = double_model
    self.dh = DataHelper()
def test_reg(self, model_file):
    """Load a saved Keras model, evaluate it and print the result.

    :param model_file: path handed to ``tf.keras.models.load_model``.
    :return: None
    """
    helper = DataHelper()
    model = tf.keras.models.load_model(model_file)
    filenames, val_labels = helper.create_test_gen(
        img_path=self.img_path, annotation_path=self.annotation_path)
    loss_eval = self._eval_model_reg(filenames, val_labels, model)

    report = (
        '=============Evaluation=================',
        loss_eval,
        '========================================',
    )
    for item in report:
        print(item)
def __init__(self):
    """Configure the data helper and file names, then build the targets."""
    helper_settings = {
        'input_dir': '../data_2017',
        'op_dir': '../examples/dataset',
        'preprocessing_cache': '../examples/preprocessing_cache',
        'stats_dir': '../stats',
    }
    self.data_helper_instance = DataHelper(**helper_settings)
    self.stats_dir = "../stats"
    self.limited_interactions = 'target_user_interactions.csv'
    self.create_targets()
def test_accuracy_dynamic(self, model):
    """Evaluate ``model`` on the validation dataset built from disk.

    For every batch only the last slice along axis 1 of each input bunch is
    fed to the model, and the arg-max of the first model output is taken as
    the predicted label.

    :param model: model exposing ``predict_on_batch`` that returns five
        outputs, the first being per-class scores.
    :return: tuple ``(accuracy, confusion_matrix / 500.0)``.
    """
    dhp = DataHelper()

    # create batches
    (img_filenames, exp_filenames, spm_up_filenames,
     spm_md_filenames, spm_bo_filenames) = dhp.create_generator_full_path_with_spm(
        img_path=self.img_path, annotation_path=self.anno_path)
    print(len(img_filenames))

    dataset = DynamicDataset().create_dataset(
        img_filenames=img_filenames,
        spm_up_filenames=spm_up_filenames,
        spm_md_filenames=spm_md_filenames,
        spm_bo_filenames=spm_bo_filenames,
        anno_names=exp_filenames,
        is_validation=True)

    predicted = []
    expected = []
    for global_bunch, upper_bunch, middle_bunch, bottom_bunch, exp_gt_b in dataset:
        # predict on batch
        model_inputs = [
            bunch[:, -1, :, :]
            for bunch in (global_bunch, upper_bunch, middle_bunch, bottom_bunch)
        ]
        probab_exp_pr_b, _, _, _, _ = model.predict_on_batch(model_inputs)

        batch_prediction = np.array([np.argmax(row) for row in probab_exp_pr_b])
        predicted.extend(batch_prediction.tolist())
        expected.extend(np.array(exp_gt_b).tolist())

    exp_pr_lbl = np.int64(np.array(predicted))
    exp_gt_lbl = np.int64(np.array(expected))

    global_accuracy = accuracy_score(exp_gt_lbl, exp_pr_lbl)
    conf_mat = confusion_matrix(exp_gt_lbl, exp_pr_lbl) / 500.0
    return global_accuracy, conf_mat
def create_synthesized_landmarks(self, model_file, test_print=False):
    """Run the landmark model over every ``.jpg``/``.png`` in ``self.img_path``.

    :param model_file: path handed to ``tf.keras.models.load_model``.
    :param test_print: forwarded unchanged to the data helper.
    :return: None
    """
    dhl = DataHelper()
    model = tf.keras.models.load_model(model_file)
    for _, file in tqdm(enumerate(os.listdir(self.img_path))):
        if not file.endswith((".jpg", ".png")):
            continue
        dhl.create_synthesized_landmarks_path(img_path=self.img_path,
                                              anno_path=self.anno_path,
                                              file=file,
                                              model=model,
                                              test_print=test_print)
class LYFeatureIntegrate2(object):
    """Builds per-user feature lists for the train and test data sets."""

    def __init__(self):
        self.d_h = DataHelper()

    @staticmethod
    def _run_extractors(extractor, data):
        """Run all feature extractors via ThreadingUtil and transpose the rows.

        :param extractor: ``LYFeatureExtraction2`` instance for ``data``.
        :param data: data set passed to every extractor.
        :return: list where entry ``c`` holds element ``c`` of every row in
            the thread runner's ``data_list``.
        """
        extractor_funcs = (
            extractor.user_driver_time,
            extractor.user_night_stat,
            extractor.user_driver_stat,
            extractor.get_distance,
            extractor.user_direction__stat,
            extractor.user_height_stat,
            extractor.user_speed_stat,
        )
        mt = ThreadingUtil()
        mt.set_thread_func_list(
            [{"func": func, "args": (data,)} for func in extractor_funcs])
        mt.start()
        return [[row[col] for row in mt.data_list]
                for col in range(len(mt.data_list[0]))]

    def get_all_features(self, path=path_train01):
        """Extract the training features and targets.

        :param path: location of the training data.
        :return: tuple ``(all_features_list, user_Y_list)``.
        """
        start = time.time()
        data = self.d_h.get_data(path)
        fe = LYFeatureExtraction2(self.d_h, data)
        user_Y_list = self.d_h.get_user_Y_list(data)
        all_features_list = self._run_extractors(fe, data)
        self.d_h.print_str += " get_train_features cost time: " + str(time.time() - start) + " "
        return all_features_list, user_Y_list

    def get_test_features02(self, path=path_test01):
        """Extract the test features.

        :param path: location of the test data.
        :return: tuple ``(userid_list, test_features)``.
        """
        start = time.time()
        data = self.d_h.get_test_data(path)
        userid_list = self.d_h.get_userlist(data)
        ft_Liyang = LYFeatureExtraction2(self.d_h, data)
        test_features = self._run_extractors(ft_Liyang, data)
        self.d_h.print_str += " \nget_test_features cost time: " + str(time.time() - start) + " "
        return userid_list, test_features
def main():
    """Run the label-noise experiment for every configured data set.

    For each data set, execution, noise level and classifier, the algorithm
    chosen by ``ConfigHelper.choose_algorithm`` is fitted on the (possibly
    filtered) noisy training data and scored on the test split. The collected
    metrics are stored once per data set as ``final_<set_name>``.
    """
    for set_name in ConfigHelper.get_datasets():
        MetricsHelper.reset_metrics()

        data, set_target = IOHelper.read_dataset(set_name)
        feats, labels = DataHelper.extract_feature_labels(data, set_target)
        DataHelper.create_label_mapping(labels)
        max_nb_feats = DataHelper.calculate_max_nb_features(feats)

        for e in range(ConfigHelper.nb_executions):
            started_at = time.time()
            print("Execution " + str(e))

            train_idxs, test_idxs = DataHelper.split_in_sets(feats, labels)
            # Yields train_X, train_y, test_X, test_y in that order.
            train_X, train_y, test_X, test_y = (
                DataHelper.select_rows(source, idxs, copy=False)
                for idxs in (train_idxs, test_idxs)
                for source in (feats, labels)
            )

            for noise_level in ConfigHelper.noise_levels:
                noisy_idxs, noisy_train_y = DataHelper.insert_noise(
                    train_y, noise_level)

                for name, clf, clean_type in ConfigHelper.get_classifiers():
                    algorithm_data = ConfigHelper.choose_algorithm(
                        clf, clean_type, train_X, noisy_train_y,
                        noisy_idxs, max_nb_feats)
                    (chosen_rate, chosen_threshold, chosen_X, chosen_y,
                     chosen_clf, true_filtered, false_filtered) = (
                        algorithm_data[k] for k in range(7))

                    chosen_clf.fit(chosen_X, chosen_y)
                    predictions = chosen_clf.predict(test_X)
                    error = MetricsHelper.calculate_error_score(
                        test_y, predictions)

                    MetricsHelper.metrics.append([
                        set_name, e, noise_level, name, chosen_rate,
                        chosen_threshold, error, true_filtered,
                        false_filtered
                    ])

            print(str(time.time() - started_at))

        IOHelper.store_results(MetricsHelper.convert_metrics_to_frame(),
                               "final_" + set_name)
def create_from_orig(self, ds_type):
    """Crop, resize and relabel the original RAF-DB samples of one split.

    Labels in the annotation file are 1-7; they are shifted to 0-6 and then
    remapped through the AffectNet-like lookup table before being saved.
    Only annotation lines whose file name contains the split prefix
    (``train`` or ``test``) are processed.

    :param ds_type: ``DatasetType.train`` or ``DatasetType.test``.
    :return: None
    :raises ValueError: if ``ds_type`` is neither train nor test.
    """
    print('create_from_orig & relabel to affectNetLike--->')

    if ds_type == DatasetType.train:
        save_img_path = RafDBConf.no_aug_train_img_path
        save_anno_path = RafDBConf.no_aug_train_annotation_path
        prefix = 'train'
    elif ds_type == DatasetType.test:
        save_img_path = RafDBConf.test_img_path
        save_anno_path = RafDBConf.test_annotation_path
        prefix = 'test'
    else:
        # Fail early instead of hitting an undefined-variable error below.
        raise ValueError('unsupported ds_type: %r' % (ds_type,))

    # The source locations are shared by both splits.
    txt_path = RafDBConf.orig_annotation_txt_path
    load_img_path = RafDBConf.orig_image_path
    load_bbox_path = RafDBConf.orig_bounding_box

    dhl = DataHelper()
    affectnet_like_lbls = [3, 4, 5, 1, 2, 6, 0]
    target_shape = (InputDataSize.image_input_size,
                    InputDataSize.image_input_size, 3)

    # read the text file, and save exp, and image
    with open(txt_path, 'r') as annotations:
        for line in annotations:
            fields = line.split(' ')
            f_name = fields[0]
            if prefix not in f_name:
                continue
            stem = f_name[:-4]

            # relabel to affectNet
            exp = affectnet_like_lbls[int(fields[1]) - 1]

            img = np.array(Image.open(load_img_path + stem + '.jpg'))

            # crop to the bounding box, resize the image and save it
            x_min, y_min, x_max, y_max = self.get_bounding_box(
                load_bbox_path + stem + '_boundingbox.txt')
            img = dhl.crop_image_bbox(img, x_min, y_min, x_max, y_max)
            res_img = resize(img, target_shape, anti_aliasing=True)
            im = Image.fromarray(np.round(res_img * 255.0).astype(np.uint8))
            im.save(save_img_path + f_name)

            # save annotation
            np.save(save_anno_path + stem + '_exp', exp)
def do_show(self, name) -> None:
    """
    Show the train, valid or test data set.

    The data helper is created on first use. Any error raised while showing
    the data set is printed instead of propagated.

    :param name: train or valid or test
    """
    helper_missing = not self.data_set_helper
    if helper_missing:
        self.data_set_helper = DataHelper(self.data_path)
        self.data_set_helper.labels = self.labels_path

    try:
        self.data_set_helper.show_data_sets(name_of_dataset=name)
    except Exception as exc:
        print(exc)
def __init__(self, country='Russian Federation', years=None):
    """Store the country and years and prepare per-year data containers.

    :param country: country name handed to the data helper.
    :param years: sequence of exactly two years; defaults to
        ``[2000, 2005]`` when ``None``.
    :raises ValueError: if a non-empty ``years`` does not hold two entries.
    """
    if years and len(years) != 2:
        # ValueError is an Exception subclass, so callers that catch
        # Exception keep working.
        raise ValueError("Must be two years...")
    if years is None:
        years = [2000, 2005]

    self.country = country
    self.years = years
    # Create the container on the instance: it is not assigned anywhere
    # before being indexed, so it would otherwise be missing or shared
    # through the class.
    # NOTE(review): assumes no class-level ``data`` is meant to be shared
    # between instances -- confirm.
    self.data = {year: {} for year in years}
    self.data_helper = DataHelper(country, years)
def create_au_mask(self):
    """Create AU masks for the augmented images that still lack one.

    An image is processed only when both its ``_exp.npy`` and ``_slnd.npy``
    annotations exist and no ``_im.jpg`` file is present next to them.
    """
    dhl = DataHelper()
    for _, file in tqdm(enumerate(os.listdir(self.img_path_aug))):
        if not file.endswith((".jpg", ".png")):
            continue

        stem = file[:-4]
        exp_file = os.path.join(self.anno_path_aug, stem + "_exp.npy")
        lnd_file = os.path.join(self.anno_path_aug, stem + "_slnd.npy")
        if not (os.path.exists(exp_file) and os.path.exists(lnd_file)):
            continue

        # presumably the output of an earlier run -- skip it
        if os.path.exists(os.path.join(self.anno_path_aug, stem + "_im.jpg")):
            continue

        dhl.create_AU_mask_path(img_path=self.img_path_aug,
                                anno_path=self.anno_path_aug,
                                file=file,
                                test_print=False)
def create_masked_image(self):
    """Fuse every annotated augmented image with its masks and save the result.

    For each ``.jpg``/``.png`` with ``_exp.npy`` and ``_slnd.npy``
    annotations, four fused arrays are written (compressed) under
    ``self.masked_img_path`` with the suffixes ``_face``, ``_eyes``,
    ``_nose`` and ``_mouth``.
    """
    dhl = DataHelper()
    for _, file in tqdm(enumerate(os.listdir(self.img_path_aug))):
        if not file.endswith((".jpg", ".png")):
            continue

        stem = file[:-4]
        lnd_file = os.path.join(self.anno_path_aug, stem + "_slnd.npy")
        if not (os.path.exists(os.path.join(self.anno_path_aug, stem + "_exp.npy"))
                and os.path.exists(lnd_file)):
            continue

        # load data
        lnd = np.load(lnd_file)
        img_file_name = os.path.join(self.img_path_aug, file)
        img = np.float32(Image.open(img_file_name)) / 255.0

        # create masks
        dr_mask = np.expand_dims(dhl.create_derivative(img=img, lnd=lnd), axis=-1)
        au_mask = np.expand_dims(dhl.create_AU_mask(img=img, lnd=lnd), axis=-1)
        up_mask, mid_mask, bot_mask = dhl.create_spatial_mask(img=img, lnd=lnd)
        up_mask = np.expand_dims(up_mask, axis=-1)
        mid_mask = np.expand_dims(mid_mask, axis=-1)
        bot_mask = np.expand_dims(bot_mask, axis=-1)

        # fuse images: the whole face uses no spatial mask
        spatial_masks = (
            ("_face", None),
            ("_eyes", up_mask),
            ("_nose", mid_mask),
            ("_mouth", bot_mask),
        )
        fused = [
            (suffix, dhl.create_input_bunches(img_batch=img,
                                              dr_mask_batch=dr_mask,
                                              au_mask_batch=au_mask,
                                              spatial_mask=mask))
            for suffix, mask in spatial_masks
        ]

        # save fused
        for suffix, bunch in fused:
            savez_compressed(self.masked_img_path + stem + suffix, bunch)
def train(
        data_helper: DataHelper,
        model: keras.Model,
        save_filename: str,
        batch_size=32,
        epochs=10
):
    """Fit ``model`` on the helper's generators, then evaluate on the test set.

    :param data_helper: provides the train/validation/test generators and
        their sample counts.
    :param model: compiled Keras model.
    :param save_filename: path used by the model checkpoint callback.
    :param batch_size: batch size for all three generators.
    :param epochs: number of training epochs.
    """
    # Model information
    model.summary()

    # Data generators for training, validation and testing
    train_data_generator = data_helper.train_data_generator(batch_size)
    validation_data_generator = data_helper.validation_data_generator(batch_size)
    test_data_generator = data_helper.test_data_generator(batch_size)

    train_steps = data_helper.train_data_count // batch_size
    validation_steps = data_helper.validation_data_count // batch_size

    # TensorBoard logging to visualise training and help tuning:
    # tensorboard --logdir logs/fit
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir='logs/fit/' + datetime.now().strftime('%Y%m%d-%H%M%S'),
        histogram_freq=1
    )
    # Save the model periodically
    checkpoint_callback = keras.callbacks.ModelCheckpoint(
        filepath=save_filename,
        monitor='sparse_categorical_accuracy',
        verbose=0,
        save_best_only=True,
        save_weights_only=False,
        mode='auto',
        save_freq='epoch'
    )

    # Training
    model.fit(
        x=train_data_generator,
        steps_per_epoch=train_steps,
        validation_data=validation_data_generator,
        validation_steps=validation_steps,
        epochs=epochs,
        shuffle=True,
        callbacks=[tensorboard_callback, checkpoint_callback],
    )

    # Testing
    model.evaluate(
        x=test_data_generator,
        steps=data_helper.test_data_count // batch_size,
    )
def create_affectnet(self, load_img_path, save_img_path, save_anno_path,
                     img_path_arr, bbox_arr, landmarks_arr,
                     expression_lbl_arr, valence_arr, arousal_arr,
                     FLD_model_file_name, do_aug, is_7):
    """Crop/resize/augment the AffectNet images and save their annotations.

    Samples labelled ``none``, ``uncertain`` or ``noface`` are always
    skipped; with ``is_7`` the ``contempt`` samples are skipped as well.
    Outputs are named after the sample index ``i``: ``<i>.jpg`` plus the
    ``_lnd``, ``_slnd``, ``_exp``, ``_val`` and ``_aro`` annotations.

    :param FLD_model_file_name: unused here; ``None`` is passed as the
        model to the data helper.
    :param do_aug: forwarded to ``DataHelper.crop_resize_aug_img``.
    :param is_7: use the 7-label variant (drops ``contempt``) when truthy.
    """
    dhl = DataHelper()
    model = None

    if is_7:
        banner = ('777777777777777777777777777777777777',
                  '++++++++++++| 7 labels |++++++++++++',
                  '777777777777777777777777777777777777')
    else:
        banner = ('888888888888888888888888888888888888',
                  '++++++++++++| 8 labels |++++++++++++',
                  '888888888888888888888888888888888888')
    for banner_line in banner:
        print(banner_line)

    print('len(img_path_arr)')
    print(len(img_path_arr))

    for i in tqdm(range(len(img_path_arr))):
        label = int(expression_lbl_arr[i])
        unusable = (label == ExpressionCodesAffectnet.none
                    or label == ExpressionCodesAffectnet.uncertain
                    or label == ExpressionCodesAffectnet.noface)
        if is_7:
            unusable = unusable or label == ExpressionCodesAffectnet.contempt
        if unusable:
            continue

        sample_id = str(i)

        # crop, resize, augment image
        dhl.crop_resize_aug_img(
            load_img_name=load_img_path + img_path_arr[i],
            save_img_name=save_img_path + sample_id + '.jpg',
            bbox=bbox_arr[i],
            landmark=landmarks_arr[i],
            save_anno_name=save_anno_path + sample_id + '_lnd',
            synth_save_anno_name=save_anno_path + sample_id + '_slnd',
            model=model,
            do_aug=do_aug)

        # save annotation: exp_lbl, valence, arousal
        save(save_anno_path + sample_id + '_exp', str(label))
        save(save_anno_path + sample_id + '_val', valence_arr[i])
        save(save_anno_path + sample_id + '_aro', arousal_arr[i])
def create_derivative_mask(self):
    """Create derivative masks for the augmented images that still lack one.

    An image is processed only when both its ``_exp.npy`` and ``_slnd.npy``
    annotations exist and no ``_dmg.jpg`` file is found in the annotation
    folder or its ``dmg/`` sub-folder.
    """
    dhl = DataHelper()
    for _, file in tqdm(enumerate(os.listdir(self.img_path_aug))):
        if not file.endswith((".jpg", ".png")):
            continue

        stem = file[:-4]
        annotated = (
            os.path.exists(os.path.join(self.anno_path_aug, stem + "_exp.npy"))
            and os.path.exists(os.path.join(self.anno_path_aug, stem + "_slnd.npy"))
        )
        if not annotated:
            continue

        # check if we have already created it
        mask_name = stem + "_dmg.jpg"
        already_created = (
            os.path.exists(os.path.join(self.anno_path_aug + 'dmg/', mask_name))
            or os.path.exists(os.path.join(self.anno_path_aug, mask_name))
        )
        if already_created:
            continue

        dhl.create_derivative_path(img_path=self.img_path_aug,
                                   anno_path=self.anno_path_aug,
                                   file=file,
                                   test_print=False)
def cal_class_distribution(data_dir, level):
    """
    Calculate the class (relation) distribution of the RST trees.

    :param data_dir: directory the RST trees are read from
    :param level: 0 for inner-sentence, 1 for inter-sentence but inner
        paragraph, 2 for inter-paragraph, 3 for different depth
    :return: for level 0-2 a Counter mapping relation -> count; for level 3
        a dict mapping depth -> Counter; in both cases every class of
        ``class2rel`` is present (with 0 if unseen). None for other levels.
    """
    rst_trees = DataHelper.read_rst_trees(data_dir)
    all_nodes = []
    for rst_tree in rst_trees:
        all_nodes.extend(rst_tree.postorder_DFT(rst_tree.tree, []))

    if level in (0, 1, 2):
        distribution = Counter(
            RstTree.extract_relation(node.child_relation)
            for node in all_nodes
            if node.level == level and node.child_relation is not None
        )
        for cla in class2rel:
            distribution.setdefault(cla, 0)
        return distribution

    if level == 3:
        per_depth = {}
        for node in all_nodes:
            # Leaves carry no relation between children.
            if node.lnode is None and node.rnode is None:
                continue
            relation = RstTree.extract_relation(node.child_relation)
            per_depth.setdefault(node.depth, Counter())[relation] += 1

        for distribution in per_depth.values():
            for cla in class2rel:
                distribution.setdefault(cla, 0)
        return per_depth
def upsample_data(self):
    """Augment samples per class using rates computed from the class counts.

    The annotation folder is scanned for ``*_exp.npy`` files to count the
    samples of each class (8 classes for the train set, 7 otherwise) and to
    group the image/annotation/landmark paths by class. The data helper then
    computes the augmentation factors and augments every class into
    ``self.img_path_aug`` / ``self.anno_path_aug``.
    """
    dhl = DataHelper()

    # count samples & categorize their address based on their category
    num_classes = 8 if self.ds_type == DatasetType.train else 7
    sample_count_by_class = np.zeros([num_classes])
    img_addr_by_class = [[] for _ in range(num_classes)]
    anno_addr_by_class = [[] for _ in range(num_classes)]
    lnd_addr_by_class = [[] for _ in range(num_classes)]

    print("counting classes:")
    for _, file in tqdm(enumerate(os.listdir(self.anno_path))):
        if not file.endswith("_exp.npy"):
            continue
        anno_file = os.path.join(self.anno_path, file)
        exp = int(np.load(anno_file))
        stem = file[:-8]  # strip the "_exp.npy" suffix

        sample_count_by_class[exp] += 1
        anno_addr_by_class[exp].append(anno_file)
        img_addr_by_class[exp].append(os.path.join(self.img_path, stem + '.jpg'))
        lnd_addr_by_class[exp].append(os.path.join(self.anno_path, stem + '_slnd.npy'))

    print("sample_count_by_category: ====>>")
    print(sample_count_by_class)

    # calculate augmentation factor for each class
    aug_factor_by_class, aug_factor_by_class_freq = dhl.calculate_augmentation_rate(
        sample_count_by_class=sample_count_by_class,
        base_aug_factor=AffectnetConf.augmentation_factor)

    # with both arrays available, augment the samples class by class
    grouped = zip(img_addr_by_class, anno_addr_by_class, lnd_addr_by_class)
    for class_index, (img_addrs, anno_addrs, lnd_addrs) in enumerate(grouped):
        dhl.do_random_augment(img_addrs=img_addrs,
                              anno_addrs=anno_addrs,
                              lnd_addrs=lnd_addrs,
                              aug_factor=int(aug_factor_by_class[class_index]),
                              aug_factor_freq=int(aug_factor_by_class_freq[class_index]),
                              img_save_path=self.img_path_aug,
                              anno_save_path=self.anno_path_aug,
                              class_index=class_index)
def mc_worker(jobs, stats, ctl, store, timeout=5):
    """Collect memcached stats for queued jobs until ``ctl["run_ok"]`` is falsy.

    Each job is a ``(root, parents, val)`` tuple where ``val`` is an iterable
    of ``"ip:port"`` strings. For every server the stats are collected,
    emitted to the JSON and Seriesly handlers, and the fast and slow stats
    are put on ``stats``. After a job, the worker sleeps for whatever is left
    of ``timeout``.

    :param jobs: queue of jobs; polled with ``timeout``.
    :param stats: queue receiving ``[stats, meta]`` pairs.
    :param ctl: mapping whose ``"run_ok"`` entry keeps the loop alive.
    :param store: passed to ``SerieslyHandler``.
    :param timeout: poll timeout and minimum period per job, in seconds.
    """
    logging.info("mc_worker started")

    while ctl["run_ok"]:
        try:
            root, parents, val = jobs.get(block=True, timeout=timeout)
        except Queue.Empty:
            logging.debug("mc_worker hasn't received jobs for %s seconds", timeout)
            continue

        start = time.time()

        for server in val:
            try:
                # Only the host is used; unpacking still validates the format.
                ip, _port = server.split(":")
            except (ValueError, AttributeError) as e:
                logging.error("unable to collect mc stats from %s : %s", server, e)
                continue

            mc_server = Server(ip)

            # get bucket name from root and parent nodes
            bucket = DataHelper.get_bucket(root, parents)

            # initialize memcached source
            mc_source = MemcachedSource(mc_server, bucket)

            # initialize handlers to dump data json doc
            j_handler = JsonHandler()
            s_handler = SerieslyHandler(store)

            # collect data from source and emit to handlers
            mc_coll = MemcachedCollector([mc_source], [j_handler, s_handler])
            mc_coll.collect()
            mc_coll.emit()
            stats.put([mc_source.fast, mc_source.meta], block=True)
            stats.put([mc_source.slow, mc_source.meta], block=True)

        delta = time.time() - start
        logging.debug("collected mc stats from %s, took %s seconds", val, delta)

        if delta < timeout:
            logging.debug("mc_worker sleep for %s seconds", timeout - delta)
            time.sleep(timeout - delta)