import matplotlib.pyplot as plt
# load_data, load_mdl_file, plot_dense_embed and plot_dense_single are project helpers assumed to be imported elsewhere in this module.


def plot_dense_big(cfg, nrow, ncol, st_cycle=1, st_cross=1):
    '''Plot the separability of all folds on one large canvas: sub_num subplots in total, arranged on an nrow*ncol grid.'''
    for cycle in range(st_cycle, cfg.cycles + 1):
        # Create one large canvas per cycle
        plt.figure()
        for cross in range(st_cross, cfg.cross_num + 1):
            cfg.set_dir(cycle, cross)
            # Load the validation data
            print('\n', '--' * 20, 'Start load data from dataset {} '.format(cfg.dt), '--' * 20, '\n')
            __, __, val_x, val_y = load_data(cross, cfg, get_tr=False)  # if cfg_2 is not None, a list holding both kinds of data is returned
            # Load the model, and print its structure once
            mdl = load_mdl_file(cfg)
            if cross == st_cross and cycle == st_cycle:
                mdl.summary()
            # Plot the separability of the model's dense layer into the current subplot
            plot_dense_embed(mdl, cfg, nrow, ncol, val_x, val_y, cross)
        # Save the large canvas to file
        plt.suptitle(cfg.mdl_nm + '__' + 'cycle' + str(cycle) + '__' + 'dense_separability')
        plt.savefig(cfg.save_one_dense_file, bbox_inches='tight')
        plt.close()
def plot_dense_all_sigle(cfg, st_cycle=1, st_cross=1):
    '''Plot all of the dense separability figures directly; not called from main.py.
    When these figures are missing, run this file on its own to draw them.
    '''
    for cycle in range(st_cycle, cfg.cycles + 1):
        for cross in range(st_cross, cfg.cross_num + 1):
            cfg.set_dir(cycle, cross)
            # Load the validation data
            print('\n', '--' * 20, 'Start load data from dataset {} '.format(cfg.dt), '--' * 20, '\n')
            __, __, val_x, val_y = load_data(cross, cfg, get_tr=False)  # if cfg_2 is not None, a list holding both kinds of data is returned
            # Load the model, and print its structure once
            mdl = load_mdl_file(cfg)
            if cross == st_cross and cycle == st_cycle:
                mdl.summary()
            # Plot the separability of the model's dense layer, then save the figure
            plot_dense_single(mdl, cfg, val_x, val_y, cross)
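# A minimal, self-contained sketch of what plotting dense-layer separability can look like:
# take the activations of a dense layer on the validation set and project them to 2-D.
# This is NOT the project's plot_dense_embed/plot_dense_single; the helper name, the PCA
# projection and the layer name 'dense' are all assumptions made for illustration.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from tensorflow import keras


def sketch_plot_dense_separability(mdl, val_x, val_y, layer_name='dense'):
    # Build a sub-model that outputs the activations of the chosen dense layer
    feat_mdl = keras.Model(inputs=mdl.input, outputs=mdl.get_layer(layer_name).output)
    feats = feat_mdl.predict(val_x)
    # Project the activations to 2-D so class clusters can be inspected visually
    emb = PCA(n_components=2).fit_transform(feats)
    labels = np.argmax(val_y, axis=1) if val_y.ndim > 1 else val_y
    plt.scatter(emb[:, 0], emb[:, 1], c=labels, s=5, cmap='tab10')
    plt.title('dense-layer separability (PCA projection)')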
import os
import pickle

import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

from preprocess_data import load_data
from tools import load_config, save_params

ProcessedData = './processed_data/preprocess.p'
ProcessedParams = './processed_data/params.p'
ProcessedDataDir = './processed_data'

# Load the preprocessed dataset from disk; rebuild it with load_data() if the pickle is missing
try:
    title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = pickle.load(
        open(ProcessedDataDir + os.sep + 'preprocess.p', mode='rb'))
except OSError:
    title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = load_data()

embed_dim, uid_max, gender_max, age_max, job_max, movie_id_max, movie_categories_max, \
    movie_title_max, combiner, sentences_size, window_sizes, filter_num = load_config()

# Map each movie id to its row index in the movies DataFrame
movieid2idx = {val[0]: i for i, val in enumerate(movies.values)}

'''
Hyper parameters
'''
# Number of Epochs
num_epochs = 1
# Batch Size
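# Illustrative lookup with the movieid2idx table built above: map a raw MovieLens movie id
# to its row in the movies DataFrame. The id 1193 is only an example value and is not taken
# from the project's own code.
example_movie_id = 1193
example_movie_row = movies.values[movieid2idx[example_movie_id]]
print(example_movie_row)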
import pickle
import tensorflow as tf

# remove_chars, rem_dubspace, map_chars, load_data and load_conllu are project helpers
# assumed to be imported earlier in the original file.

# Report the available GPUs and enable memory growth on the first one
# (physical_devices is assumed to be the list of visible GPUs; its definition is not shown in the original excerpt)
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print(f"Num GPUs Available: {len(physical_devices)}")
tf.config.experimental.set_memory_growth(physical_devices[0], True)

# Load test data
test_in = open("toktest.pkl", "rb")
test_set = pickle.load(test_in)
x_test = remove_chars(test_set[0])

# Identify models to test, with their appropriate character conversion dictionaries and dictionary sizes
text_name_1 = "Wb. Training Glosses"
text_designation_1 = "Wb"
one_text = [rem_dubspace(" ".join(pickle.load(open("toktrain.pkl", "rb"))))]
mapping_1 = map_chars(load_data(one_text, text_name_1))
model_1 = "models\\models\\Wb-model, 2 layer(s) of 100 LSTM Nodes, 1 Dense, 250 Ep, No Bat, 10.0% Val"
char_dict_1, rchardict_1, size_vocab_1 = mapping_1[0], mapping_1[1], mapping_1[2]

text_name_2 = "Sg. Training Glosses"
text_designation_2 = "Sg"
two_text = [rem_dubspace(" ".join(load_conllu('sga_dipsgg-ud-test_combined_POS.conllu')))]
mapping_2 = map_chars(load_data(two_text, text_name_2))
model_2 = "models\\models\\Sg-model, 2 layer(s) of 100 LSTM Nodes, 1 Dense, 250 Ep, No Bat, 10.0% Val"
char_dict_2, rchardict_2, size_vocab_2 = mapping_2[0], mapping_2[1], mapping_2[2]
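# Minimal sketches of the text-cleaning helpers referenced above, assuming their names
# describe their behaviour. These are illustrative stand-ins, not the project's actual
# rem_dubspace/remove_chars implementations; the unwanted-character set is made up.
import re


def sketch_rem_dubspace(text):
    """Collapse runs of whitespace into single spaces."""
    return re.sub(r'\s{2,}', ' ', text)


def sketch_remove_chars(strings, unwanted='*<>[]'):
    """Strip a set of unwanted characters from each string in a list."""
    table = str.maketrans('', '', unwanted)
    return [s.translate(table) for s in strings]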
# If a previous run was interrupted, these result files already exist; load each of them and
# continue training. The recorded training time cannot be updated, so treat the time as a
# rough reference only.
# Note that this block also makes every run after the 0th one reload these variables from the
# earlier files.
print("\n\n\n", '#' * 70, "Start process of {} cross ".format(cross), '#' * 70)
if os.path.exists(cfg.cnf_mtr_cycle_file):
    cnf_mtr_cycle = np.load(cfg.cnf_mtr_cycle_file)
    acc_list = np.load(cfg.acc_cycle_file)
    best_acc_list = np.load(cfg.best_acc_cycle_file)
    pre_list = np.load(cfg.pre_cycle_file)
    re_list = np.load(cfg.re_cycle_file)
    f1_list = np.load(cfg.f1_cycle_file)
    print("\nReloading results from file...... \n")

# Load the data
print('\n', '--' * 20, 'Start load data from dataset {} '.format(cfg.dt), '--' * 20, '\n')
tr_x, tr_y, val_x, val_y = load_data(cross, cfg)

# Normalize the data along the sample dimension
if cfg.is_Z_Norm:
    tr_x = Z_Norm(tr_x)
    val_x = Z_Norm(val_x)

# When debugging, reduce the amount of data
if cfg.rdc_smp:
    tr_x = tr_x[:cfg.rdc_smp_num]
    tr_y = tr_y[:cfg.rdc_smp_num]
    val_x = val_x[:cfg.rdc_smp_num]
    val_y = val_y[:cfg.rdc_smp_num]

# Build the model
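# A minimal sketch of per-sample z-score normalization, matching the "normalize along the
# sample dimension" comment above. This is an assumption about what the project's Z_Norm
# helper does, not its actual implementation.
import numpy as np


def sketch_z_norm(x, eps=1e-8):
    """Standardize each sample to zero mean and unit variance over its feature axes."""
    axes = tuple(range(1, x.ndim))               # all axes except the sample axis
    mean = x.mean(axis=axes, keepdims=True)
    std = x.std(axis=axes, keepdims=True)
    return (x - mean) / (std + eps)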
import pickle
from datetime import datetime

import numpy as np
import pandas as pd

import preprocess_data, detect_crowded, convertJson, analyse_crowded
from config import Config

# get_file, del_raw_data and del_prep_data are helpers assumed to be defined elsewhere in this module.


def handler():
    print('Analyse latest data...')
    t = pd.Timestamp(datetime.now())
    latest_bucket = t.floor('{}min'.format(Config.interval))  # assuming this will be very close to (but still after) the wished timestamp
    convertJson.main()  # transform json format of most recent tweets to csv format
    tweets = preprocess_data.load_data()
    filtered_tweets = preprocess_data.filter_spam(tweets)
    grid_tweets = preprocess_data.calc_grid(filtered_tweets)

    # create_time_series() can be used, but here it yields only one time slice
    # because the provided data covers only a single interval
    timeslice, oldest = detect_crowded.create_time_series(grid_tweets)
    timeslice[0][1][1] = 200  # manually overwrite one grid cell of the new time slice with a fixed count

    timeseries = get_file(Config.helper_files, 'timeseries.p')
    # Uncomment to preserve the timeseries built from static data; timeseries.p is overwritten in real-time mode
    # pickle.dump(timeseries, open(Config.helper_files + 'timeseries_from_static.p', 'wb'))

    sl_window = int((24 / (Config.interval / 60)) * Config.sliding_window)  # how many times the interval fits into a day, times the sliding window
    timeseries = np.append(timeseries, timeslice, axis=0)
    pickle.dump(timeseries, open(Config.helper_files + 'timeseries.p', 'wb'))

    if len(timeseries) < Config.sliding_window:
        print('The timeseries is too small to detect crowded places. '
              'Please wait ', Config.sliding_window - len(timeseries), ' intervals more.')
        return

    crowded_places = detect_crowded.determine_crowded_per_cell_timeseries(timeseries, real_time_flag=True)
    if not crowded_places:
        print('No crowded places detected.')
    else:
        first_bucket = latest_bucket - pd.Timedelta(minutes=(len(timeseries) * Config.interval))
        crowded_places = detect_crowded.check_amount_tweets(crowded_places, first_bucket)
        related_events_sample = analyse_crowded.get_details(grid_tweets, crowded_places)
        if not related_events_sample:
            print('No new events detected.')
        else:
            print('HURRAY! New events detected.')
            master_object = get_file(Config.interval, 'master_object.p')
            for key, value in related_events_sample.items():
                master_object[key] = value
            pickle.dump(master_object, open(Config.results + 'master_object.p', 'wb'))

    timeseries = timeseries[-sl_window:]  # trim timeseries to the sliding window size
    del_raw_data()  # deletion logic for raw data and outdated preprocessed data
    del_prep_data()
    print('Processing for {0} finished. Next analysis is taking place at {1}'.format(
        latest_bucket, latest_bucket + pd.Timedelta(minutes=Config.interval)))
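# Worked example of the sliding-window arithmetic in handler() above. The values are
# illustrative, not taken from the project's config, and assume (as the inline comment
# suggests) that Config.sliding_window is expressed in days: with a 60-minute interval there
# are 24 / (60 / 60) = 24 slices per day, so a 2-day window keeps 48 slices.
example_interval = 60        # minutes per time slice (assumed example value)
example_sliding_window = 2   # window length in days (assumed example value)
example_sl_window = int((24 / (example_interval / 60)) * example_sliding_window)
assert example_sl_window == 48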
import progressbar
from tqdm import tqdm
import numpy as np
import tensorflow as tf
import keras as k

# config = tf.ConfigProto(device_count={'GPU': 1, 'CPU': 4})
# sess = tf.Session(config=config)
# k.backend.set_session(sess)

# import sys
# np.set_printoptions(threshold=sys.maxsize)

import preprocess_data as pre

# Clean and load the dialog corpus, then build the vocabulary
pre.clean_dataset()
train_data, test_data = pre.load_data()
word_to_index = pre.get_word_to_index(train_data)
n_vocab = len(word_to_index)
print(word_to_index)

# Convert user utterances to index sequences and print the first dialog for inspection
dialogs = pre.get_user_utterances(train_data, word_to_index)
test_dialogs = pre.get_user_utterances(test_data, word_to_index)
print(dialogs[0])
for turn in dialogs[0]:
    print(pre.index_to_word(turn, word_to_index))

# Build the slot ontology and the belief-state targets (raw, index and one-hot forms)
food_dict, area_dict, pricerange_dict = pre.get_ontology(train_data)
bs, bs_index, bs_onehot = pre.get_belief_states(train_data, food_dict, area_dict, pricerange_dict)
test_bs, test_bs_index, test_bs_onehot = pre.get_belief_states(test_data, food_dict, area_dict, pricerange_dict)
food_nb_classes = len(food_dict)
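# Illustrative sketch of turning a single slot value into a one-hot vector, matching the
# index/one-hot outputs returned by get_belief_states above. The helper name and the example
# value are assumptions for illustration, not the project's actual implementation.
import numpy as np


def sketch_slot_to_onehot(value, slot_dict):
    """Encode one slot value as a one-hot vector over that slot's ontology."""
    onehot = np.zeros(len(slot_dict), dtype=np.float32)
    onehot[slot_dict[value]] = 1.0
    return onehot

# e.g. sketch_slot_to_onehot('italian', food_dict) sets exactly one position to 1,
# assuming food_dict maps each food value to a unique integer index.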
import preprocess_data, detect_crowded, convertJson, analyse_crowded
import pandas as pd
import ast, pickle, json
from config import Config

# ================================== Phase 1 - Load data ==========================================================
# Transform tweets from JSON to csv format
# convertJson.main()

previous_map_size = pickle.load(open(Config.helper_files + 'previous_map_size.p', 'rb'))
if previous_map_size != Config.map_size:
    print('Mapsize was changed, therefore data must be preprocessed again. This can take a minute or two.')
    tweets = preprocess_data.load_data()
    tweets = preprocess_data.filter_spam(tweets)
    tweets = preprocess_data.calc_grid(tweets)
    tweets.to_csv(Config.helper_files + 'tweets.csv', sep='\t', encoding='utf-8')
    pickle.dump(Config.map_size, open(Config.helper_files + 'previous_map_size.p', 'wb'))

# ================================== Phase 2 - Detect Crowded =====================================================
print('Detecting crowd...')
tweets = pd.read_csv(Config.helper_files + 'tweets.csv',
                     parse_dates={'datetime': ['date']},
                     converters={'grid': ast.literal_eval},  # without this, pandas would load the tuple as type string
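# Small illustration of why the ast.literal_eval converter above is needed: a grid cell
# written to csv comes back as the string "(3, 5)" unless it is parsed back into a tuple.
# The example value is made up for demonstration.
import ast

raw_cell = "(3, 5)"                      # what pandas would otherwise hand back as a plain string
parsed_cell = ast.literal_eval(raw_cell)
assert parsed_cell == (3, 5) and isinstance(parsed_cell, tuple)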