def get_dataset(log_file, pretrain=True, is_time=False, triplet_loss=False):
    user_log = parse_log(log_file=log_file)  # {username: [Message(content, timestamp)]}
    user_followers = load_followers("followers.csv")
    if pretrain:
        if not is_time:
            train_data = load_data(user_log, user_followers, pretrain)
            X_all, y_all, names_all = load_feat_data(train_data, pretrain=pretrain)
        else:
            X_all, y_all, names_all = load_temporal_feat(user_log, user_followers, pretrain)
        X_train, X_val, y_train, y_val, names_train, names_val = train_test_split(
            X_all, y_all, names_all, test_size=0.25, random_state=42)
        print("train name size", len(set(names_train)))
        print("val name size", len(set(names_val)))
        if triplet_loss:
            trainset = TripletChatData(X_train, y_train, names_train)
        else:
            trainset = ChatData(X_train, y_train, names_train)
        valset = ChatData(X_val, y_val, names_val)
        allset = ChatData(X_all, y_all, names_all)
        return trainset, valset, allset
    else:
        train_split = "train_small.csv"
        valid_split = "valid_small.csv"
        if not is_time:
            train_data = load_data(user_log, user_followers, pretrain, split=train_split)
            val_data = load_data(user_log, user_followers, pretrain, split=valid_split)
            X_train, y_train, names_train = load_feat_data(train_data, pretrain=pretrain)
            X_val, y_val, names_val = load_feat_data(val_data, pretrain=pretrain)
        else:
            X_train, y_train, names_train = load_temporal_feat(user_log, user_followers, pretrain, split=train_split)
            X_val, y_val, names_val = load_temporal_feat(user_log, user_followers, pretrain, split=valid_split)
        if triplet_loss:
            trainset = TripletChatData(X_train, y_train, names_train)
        else:
            trainset = ChatData(X_train, y_train, names_train)
        valset = ChatData(X_val, y_val, names_val)
        return trainset, valset
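
# A minimal standalone sketch (not from this repo) of the scikit-learn call
# get_dataset() relies on above: one train_test_split call keeps the three
# parallel sequences (features, labels, usernames) aligned under the same
# shuffled indices. The toy arrays are hypothetical.
from sklearn.model_selection import train_test_split

X = [[0.1], [0.2], [0.3], [0.4]]
y = [0, 1, 0, 1]
names = ["alice", "bob", "carol", "dave"]

# X_train[i], y_train[i], and names_train[i] still refer to the same user.
X_train, X_val, y_train, y_val, names_train, names_val = train_test_split(
    X, y, names, test_size=0.25, random_state=42)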
channel.queue_declare(queue='log-analysis')

# Read weblogs and ingest GET requests into the queue
with open('weblogs.log', 'r', errors='ignore') as f:
    for msg in f:
        try:
            # If the message is a GET request, ingest it into the queue
            if is_get_request(msg):
                # Parse the GET request for the relevant fields
                day, status, source = parse_log(msg)
                # Store in RabbitMQ
                body = json.dumps({
                    'day': str(day),
                    'status': status,
                    'source': source
                })
                channel.basic_publish(exchange='',
                                      routing_key='log-analysis',
                                      body=body)
        except Exception:
            # str() is required: sys.exc_info()[0] is a type, not a string,
            # so concatenating it directly would itself raise a TypeError
            print("Unexpected error: " + str(sys.exc_info()[0]))
connection.close()
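
# A minimal sketch of the setup the snippet above assumes, using pika's
# BlockingConnection. The host name and this setup code are assumptions,
# not taken from the source; only `channel`, `connection`, and the queue
# name appear in the original.
import json
import sys

import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
channel = connection.channel()
# queue_declare is idempotent, so producer and consumer can both declare it
channel.queue_declare(queue='log-analysis')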
import csv

from utils import load_followers, parse_log
import matplotlib.pyplot as plt
from extract_feat import generate_one_hour_density_feature

if __name__ == "__main__":
    print("extract temporal feature")
    data = parse_log("chat_log_pretrain.csv")
    user_density = generate_one_hour_density_feature(data)
    # user_followers = load_followers("followers.csv")
    # X = []
    # y = []
    # for user in user_density:
    #     if user in user_followers:
    #         density = user_density[user]
    #         followers = user_followers[user]
    #         if density > 10 or followers > 10000:
    #             continue
    #         X.append(density)
    #         y.append(followers)
    # plt.scatter(X, y)
    # plt.show()
            prev_time = m.timestamp
        feat_vector.append(curr_cnt)
        # print(user, len(feat_vector))
        user_density[user] = feat_vector
    return user_density


def calculate_chat_density(user_time):
    user_density = {}
    for username in user_time:
        times = user_time[username]
        if max(times) != min(times):
            density = len(times) / (max(times) - min(times))
            if density > 1e-2:
                continue
            user_density[username] = density * 1e5
            print(username, user_density[username])
    return user_density


if __name__ == "__main__":
    user_log = parse_log("chat_log_target.csv")
    # load_data(user_log, output_file="pretrain_data.txt")
    # print("generating model...")
    # model = train_unsupervised(input="pretrain_data.txt", model='skipgram', dim=500)
    # model.save_model("pretrain_token.bin")
    model = fastText.load_model("pretrain_token.bin")
    feat_path = "fasttext_feat_500_sent_target/"
    generate_fasttext_embeddings(user_log, model)
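
# A minimal sketch of obtaining per-message sentence embeddings like those
# generate_fasttext_embeddings() presumably produces. Assumptions: the current
# `fasttext` PyPI bindings (the snippet above uses the older `fastText` module
# name) and an existing model file. get_sentence_vector() is part of the
# fasttext API; the text and file name here are illustrative.
import fasttext

model = fasttext.load_model("pretrain_token.bin")
vec = model.get_sentence_vector("hello chat")  # 1-D numpy array, length == model dim
print(vec.shape)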
    if _seed:
        stan_args['seed'] = _seed
    fit = sm.vb(data=data, tol_rel_obj=_tol_rel_obj,
                elbo_samples=_elbo_samples, grad_samples=_grad_samples,
                iter=_iter, sample_file=sample_path,
                diagnostic_file=sample_path + ".diag",
                algorithm=_variational, **stan_args)
    # parse the log file
    utils.convert_samples_to_nexus(tree, sample_path, tree_path, _rate)
    utils.parse_log(sample_path, 0.05)
else:
    stan_args = {'seed': _seed}
    fit = sm.sampling(data=data, algorithm=_algorithm.upper(),
                      sample_file=sample_path, chains=_chains,
                      iter=_iter, thin=_thin, **stan_args)
    # with chains=1 pystan writes samples to sample_file as-is
    if _chains == 1:
        if sample_path.endswith('.csv'):
            tree_path = sample_path.replace('.csv', '.trees')
        utils.convert_samples_to_nexus(tree, sample_path, tree_path, _rate)
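
# A minimal sketch of the PyStan 2 setup assumed above: `sm` is a compiled
# StanModel and `data` is the model's data dict. The model file name and the
# data values are hypothetical; sm.vb() and sm.sampling() are the PyStan 2
# entry points the fragment calls.
import pystan

sm = pystan.StanModel(file="model.stan")  # compile once, then reuse
data = {"N": 3, "y": [0.1, 0.2, 0.3]}     # whatever the model block declares
fit = sm.sampling(data=data, chains=1, iter=1000, sample_file="samples.csv")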
process = open(process_logs, 'r').readlines()
postprocess = open(postprocess_logs, 'r').readlines()
manually_cleaned = open(manually_cleaned_logs, 'r').readlines()
master = open(master_logs, 'r').readlines()

# Define dict to store log contents
logs = {'preprocess': [], 'process': [], 'postprocess': [],
        'manually_cleaned': [], 'master': []}

# parse preprocessing logs
for line in preprocess:
    logs['preprocess'].append(parse_log(line))

# parse processing logs
for line in process:
    logs['process'].append(parse_log(line))

# parse postprocessing logs
for line in postprocess:
    logs['postprocess'].append(parse_log(line))

# parse manually cleaned logs
for line in manually_cleaned:
    logs['manually_cleaned'].append(parse_log(line))
def test_local_fail(self):
    day, status, source = parse_log(sample_log[3])
    self.assertEqual(status, '404')
    self.assertEqual(source, 'local')
    self.assertEqual(day, date(year=1995, month=10, day=11))
def test_remote_fail(self):
    day, status, source = parse_log(sample_log[1])
    self.assertEqual(status, '404')
    self.assertEqual(source, 'remote')
    self.assertEqual(day, date(year=1994, month=10, day=24))
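
# A minimal sketch of a parse_log consistent with the two tests above,
# assuming Common Log Format-style lines (e.g. NASA HTTP logs) such as
#   'host - - [24/Oct/1994:13:41:41 -0600] "GET /index.html HTTP/1.0" 404 -'
# The regex and the local/remote rule are assumptions; the real parse_log
# in this repo may classify hosts differently.
import re
from datetime import datetime

LOG_RE = re.compile(r'^(\S+) \S+ \S+ \[([^\]:]+):')

def parse_log(line):
    m = LOG_RE.match(line)
    host, day_str = m.group(1), m.group(2)
    day = datetime.strptime(day_str, '%d/%b/%Y').date()
    status = line.split('"')[-1].split()[0]        # field right after the request
    source = 'remote' if '.' in host else 'local'  # crude heuristic, assumption
    return day, status, source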
def main_3_5(make_table_figure=False):
    datatypes = ["Multi", "RGB"]
    splitcounts = [i for i in range(1, 11)]
    H = len(splitcounts)
    loss = nn.CrossEntropyLoss()

    if make_table_figure:
        illum_types = ["illum1", "illum2", "illum3", "normal"]
        positions = [i + 1 for i in range(7)]   # 7 capture positions, numbered 1-7
        glass_types = [1, 5, 6]                 # `1` = no glasses, `5` = glasses, `6` = sunglasses
        for datatype in datatypes:
            print("Generating tables and figures [{}]...".format(datatype))
            usedChannels = [i for i in range(1, 26)] \
                    if datatype == "Multi" else "RGB"
            data_acc_illum_types  = np.zeros(shape=(H, len(illum_types)))
            data_loss_illum_types = np.zeros(shape=(H, len(illum_types)))
            data_acc_positions    = np.zeros(shape=(H, len(positions)))
            data_loss_positions   = np.zeros(shape=(H, len(positions)))
            data_acc_glass_types  = np.zeros(shape=(H, len(glass_types)))
            data_loss_glass_types = np.zeros(shape=(H, len(glass_types)))

            for i in range(len(splitcounts)):
                splitcount = splitcounts[i]

                ## get the configer
                configer = get_configer(datatype=datatype, splitcount=splitcount,
                                        usedChannels=usedChannels)

                ## load the saved test outputs
                logpath = os.path.join(configer.logspath, configer.modelname)
                print(logpath)
                y_pred_prob = np.load(os.path.join(logpath, 'test_out.npy'))

                ## load the file list
                testset = RecognizeDataset(configer.datapath, configer.datatype,
                                           configer.splitmode, 'test', configer.usedChannels)
                y_true = np.array(list(map(lambda x: x[1], testset.samplelist)))
                test_list = testset.filelist
                test_attr_list = list(map(lambda x: ImageAttributes(x), test_list))
                del testset

                ## analyze illumination
                for j in range(len(illum_types)):
                    illum_type = illum_types[j]
                    # FIXME: originally filtered on illum_type only; the
                    # glass_type == 1 condition is currently disabled.
                    # index = list(map(lambda x: x.illum_type == illum_type, test_attr_list))
                    index = list(map(lambda x: x.illum_type == illum_type and
                                               x.position == 4, test_attr_list))
                    index = np.array(index, dtype=bool)
                    y_pred_prob_sub = torch.tensor(y_pred_prob[index])
                    y_true_sub = torch.tensor(y_true[index])
                    data_acc_illum_types [i, j] = accuracy(y_pred_prob_sub, y_true_sub).cpu().numpy()
                    data_loss_illum_types[i, j] = loss(y_pred_prob_sub, y_true_sub).cpu().numpy()

                ## analyze position
                for j in range(len(positions)):
                    position = positions[j]
                    # FIXME:
                    # index = list(map(lambda x: x.position == position, test_attr_list))
                    index = list(map(lambda x: x.position == position and
                                               x.glass_type == 1, test_attr_list))
                    index = np.array(index, dtype=bool)
                    y_pred_prob_sub = torch.tensor(y_pred_prob[index])
                    y_true_sub = torch.tensor(y_true[index])
                    data_acc_positions [i, j] = accuracy(y_pred_prob_sub, y_true_sub).cpu().numpy()
                    data_loss_positions[i, j] = loss(y_pred_prob_sub, y_true_sub).cpu().numpy()

                ## analyze glasses
                for j in range(len(glass_types)):
                    glass_type = glass_types[j]
                    index = list(map(lambda x: x.glass_type == glass_type, test_attr_list))
                    index = np.array(index, dtype=bool)
                    y_pred_prob_sub = torch.tensor(y_pred_prob[index])
                    y_true_sub = torch.tensor(y_true[index])
                    data_acc_glass_types [i, j] = accuracy(y_pred_prob_sub, y_true_sub).cpu().numpy()
                    data_loss_glass_types[i, j] = loss(y_pred_prob_sub, y_true_sub).cpu().numpy()

            ## build the tables
            rows_name = [str(i) for i in splitcounts] + ['average']
            # ==================================================================
            head_name = "count/illumination"
            cols_name = illum_types
            # ------------------------------------------------------------------
            data_acc_illum_types  = np.r_[data_acc_illum_types,  np.mean(data_acc_illum_types,  axis=0).reshape(1, -1)]
            data_loss_illum_types = np.r_[data_loss_illum_types, np.mean(data_loss_illum_types, axis=0).reshape(1, -1)]
            # ------------------------------------------------------------------
            table_acc_illum_types  = gen_markdown_table_2d(head_name, rows_name, cols_name, data_acc_illum_types)
            table_loss_illum_types = gen_markdown_table_2d(head_name, rows_name, cols_name, data_loss_illum_types)
            with open("images/3_5_<table>_[{}]_[illum_types].md".format(datatype), 'w') as f:
                f.write("\n\nacc\n")
                f.write(table_acc_illum_types)
                f.write("\n\nloss\n")
                f.write(table_loss_illum_types)
            # ------------------------------------------------------------------
            plt.figure()
            plt.subplot(121); plt.title("acc")
            avg_acc = data_acc_illum_types[-1]
            plt.bar(np.arange(avg_acc.shape[0]), avg_acc)
            plt.subplot(122); plt.title("loss")
            avg_loss = data_loss_illum_types[-1]
            plt.bar(np.arange(avg_loss.shape[0]), avg_loss)
            plt.savefig("images/3_5_<figure>_[{}]_[illum_types].png".format(datatype))
            # ==================================================================
            head_name = "count/position"
            cols_name = list(map(str, positions))
            # ------------------------------------------------------------------
            data_acc_positions  = np.r_[data_acc_positions,  np.mean(data_acc_positions,  axis=0).reshape(1, -1)]
            data_loss_positions = np.r_[data_loss_positions, np.mean(data_loss_positions, axis=0).reshape(1, -1)]
            # ------------------------------------------------------------------
            table_acc_positions  = gen_markdown_table_2d(head_name, rows_name, cols_name, data_acc_positions)
            table_loss_positions = gen_markdown_table_2d(head_name, rows_name, cols_name, data_loss_positions)
            with open("images/3_5_<table>_[{}]_[positions].md".format(datatype), 'w') as f:
                f.write("\n\nacc\n")
                f.write(table_acc_positions)
                f.write("\n\nloss\n")
                f.write(table_loss_positions)
            # ------------------------------------------------------------------
            plt.figure()
            plt.subplot(121); plt.title("acc")
            avg_acc = data_acc_positions[-1]
            plt.bar(np.arange(avg_acc.shape[0]), avg_acc)
            plt.subplot(122); plt.title("loss")
            avg_loss = data_loss_positions[-1]
            plt.bar(np.arange(avg_loss.shape[0]), avg_loss)
            plt.savefig("images/3_5_<figure>_[{}]_[positions].png".format(datatype))
            # ==================================================================
            head_name = "count/glasses"
            cols_name = list(map(str, glass_types))
            # ------------------------------------------------------------------
            data_acc_glass_types  = np.r_[data_acc_glass_types,  np.mean(data_acc_glass_types,  axis=0).reshape(1, -1)]
            data_loss_glass_types = np.r_[data_loss_glass_types, np.mean(data_loss_glass_types, axis=0).reshape(1, -1)]
            # ------------------------------------------------------------------
            table_acc_glass_types  = gen_markdown_table_2d(head_name, rows_name, cols_name, data_acc_glass_types)
            table_loss_glass_types = gen_markdown_table_2d(head_name, rows_name, cols_name, data_loss_glass_types)
            with open("images/3_5_<table>_[{}]_[glass_types].md".format(datatype), 'w') as f:
                f.write("\n\nacc\n")
                f.write(table_acc_glass_types)
                f.write("\n\nloss\n")
                f.write(table_loss_glass_types)
            # ------------------------------------------------------------------
            plt.figure()
            plt.subplot(121); plt.title("acc")
            avg_acc = data_acc_glass_types[-1]
            plt.bar(np.arange(avg_acc.shape[0]), avg_acc)
            plt.subplot(122); plt.title("loss")
            avg_loss = data_loss_glass_types[-1]
            plt.bar(np.arange(avg_loss.shape[0]), avg_loss)
            plt.savefig("images/3_5_<figure>_[{}]_[glass_types].png".format(datatype))
        return

    start_time = time.time(); elapsed_time = 0
    for datatype in datatypes:
        usedChannels = [i for i in range(1, 26)] \
                if datatype == "Multi" else "RGB"
        data_acc  = np.zeros(H)
        data_loss = np.zeros(H)
        for i in range(len(splitcounts)):
            splitcount = splitcounts[i]
            configer = get_configer(datatype=datatype, splitcount=splitcount,
                                    usedChannels=usedChannels)
            elapsed_time += time.time() - start_time
            start_time = time.time()
            print("Main 3.5 [{}] [{}] {}... Elapsed >>> {} min".
                  format(configer.datatype, configer.splitmode, usedChannels, elapsed_time/60))
            logpath = os.path.join(configer.logspath, configer.modelname)
            print(logpath)
            if os.path.exists(logpath):
                with open(os.path.join(logpath, 'test_log.txt'), 'r') as f:
                    test_log = f.readlines()[0]
                data_acc[i], data_loss[i] = parse_log(test_log)
                print(test_log)
            else:
                train(configer)
                data_acc[i], data_loss[i] = test(configer)
            print("-------------------------------------------------")

        ## save the data
        avg_acc  = np.mean(data_acc, axis=0)
        avg_loss = np.mean(data_loss, axis=0)
        table_data_acc  = np.r_[data_acc, avg_acc]
        table_data_loss = np.r_[data_loss, avg_loss]
        table_data = np.r_[table_data_acc.reshape(1, -1), table_data_loss.reshape(1, -1)]
        np.savetxt("images/3_5_<data>_[{}].txt".format(datatype), table_data)
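
# A minimal sketch of the parse_log used by main_3_1 .. main_3_5, which must
# return (acc, loss) from the single line in test_log.txt. The line format in
# the comment is hypothetical; the real format depends on how test() writes it,
# and this sketch assumes accuracy appears before loss as a decimal float.
import re

def parse_log(test_log):
    # e.g. a hypothetical line: "test | acc: 0.9731 | loss: 0.1204"
    acc, loss = map(float, re.findall(r'\d+\.\d+', test_log)[:2])
    return acc, loss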
def main_3_4(make_table_figure=False):
    ORDER = [i+1 for i in range(25)]
    usedChannels_list = [ORDER[::i+1] for i in range(len(ORDER))]
    splitcounts = [i for i in range(1, 11)]
    H, W = len(splitcounts), len(usedChannels_list)

    if make_table_figure:
        print("Generating tables and figures [Multi]...")
        table_data = np.loadtxt("images/3_4_<data>_[Multi].txt")
        table_data_acc, table_data_loss = np.vsplit(table_data, 2)  # split vertically into two blocks (top/bottom)

        ## build the table
        head_name = "count/band stride"
        rows_name = [str(i) for i in splitcounts] + ['average']
        cols_name = [str(i+1) for i in range(W)]
        table_acc  = gen_markdown_table_2d(head_name, rows_name, cols_name, table_data_acc)
        table_loss = gen_markdown_table_2d(head_name, rows_name, cols_name, table_data_loss)
        with open("images/3_4_<table>_[Multi].md", 'w') as f:
            f.write("\n\nacc\n")
            f.write(table_acc)
            f.write("\n\nloss\n")
            f.write(table_loss)

        ## plot
        plt.figure()
        plt.subplot(121); plt.title("acc")
        avg_acc = table_data_acc[-1]
        plt.bar(np.arange(avg_acc.shape[0]), avg_acc)
        plt.subplot(122); plt.title("loss")
        avg_loss = table_data_loss[-1]
        plt.bar(np.arange(avg_loss.shape[0]), avg_loss)
        plt.savefig("images/3_4_<figure>_[Multi].png")
        return

    start_time = time.time(); elapsed_time = 0
    data_acc  = np.zeros(shape=(H, W))
    data_loss = np.zeros(shape=(H, W))
    for i in range(len(splitcounts)):
        splitcount = splitcounts[i]
        for j in range(len(usedChannels_list)):
            usedChannels = usedChannels_list[j]
            configer = get_configer(splitcount=splitcount, usedChannels=usedChannels)
            elapsed_time += time.time() - start_time
            start_time = time.time()
            print("Main 3.4 [Multi] [{}] {}... Elapsed >>> {} min".
                  format(configer.splitmode, usedChannels, elapsed_time/60))
            logpath = os.path.join(configer.logspath, configer.modelname)
            print(logpath)
            if os.path.exists(logpath):
                with open(os.path.join(logpath, 'test_log.txt'), 'r') as f:
                    test_log = f.readlines()[0]
                data_acc[i, j], data_loss[i, j] = parse_log(test_log)
                print(test_log)
            else:
                train(configer)
                data_acc[i, j], data_loss[i, j] = test(configer)
            print("-------------------------------------------------")

    ## save the data
    avg_acc  = np.mean(data_acc, axis=0)
    avg_loss = np.mean(data_loss, axis=0)
    table_data_acc  = np.r_[data_acc, avg_acc.reshape(1, -1)]
    table_data_loss = np.r_[data_loss, avg_loss.reshape(1, -1)]
    table_data = np.r_[table_data_acc, table_data_loss]
    np.savetxt("images/3_4_<data>_[Multi].txt", table_data)
def main_3_2(make_table_figure=False):
    datatypes = ["Multi", "RGB"]
    splitcounts = [i for i in range(1, 11)]
    H = len(splitcounts)

    if make_table_figure:
        for datatype in datatypes:
            usedChannels_list = [[i] for i in range(1, 26)] \
                    if datatype == "Multi" else ["R", "G", "B"]
            W = len(usedChannels_list)
            print("Generating tables and figures [{}]...".format(datatype))
            table_data = np.loadtxt("images/3_2_<data>_[{}].txt".format(datatype))
            table_data_acc, table_data_loss = np.vsplit(table_data, 2)  # split vertically into two blocks (top/bottom)

            ## build the table
            head_name = "count/band index"
            rows_name = [str(i) for i in splitcounts] + ['average']
            cols_name = [str(i[0]) for i in usedChannels_list]
            table_acc  = gen_markdown_table_2d(head_name, rows_name, cols_name, table_data_acc)
            table_loss = gen_markdown_table_2d(head_name, rows_name, cols_name, table_data_loss)
            with open("images/3_2_<table>_[{}].md".format(datatype), 'w') as f:
                f.write("\n\nacc\n")
                f.write(table_acc)
                f.write("\n\nloss\n")
                f.write(table_loss)

            ## plot
            plt.figure()
            plt.subplot(121); plt.title("acc")
            avg_acc = table_data_acc[-1]
            plt.bar(np.arange(avg_acc.shape[0]), avg_acc)
            plt.subplot(122); plt.title("loss")
            avg_loss = table_data_loss[-1]
            plt.bar(np.arange(avg_loss.shape[0]), avg_loss)
            plt.savefig("images/3_2_<figure>_[{}].png".format(datatype))

            ## print the best band ordering, ranked by accuracy
            avg_acc = table_data_acc[-1]
            order = np.argsort(avg_acc)[::-1] + 1
            print("Best: ", order)
        return

    start_time = time.time(); elapsed_time = 0
    for datatype in datatypes:
        usedChannels_list = [[i] for i in range(1, 26)] \
                if datatype == "Multi" else ["R", "G", "B"]
        W = len(usedChannels_list)
        data_acc  = np.zeros(shape=(H, W))
        data_loss = np.zeros(shape=(H, W))
        for i in range(len(splitcounts)):
            splitcount = splitcounts[i]
            for j in range(len(usedChannels_list)):
                usedChannels = usedChannels_list[j]
                configer = get_configer(datatype=datatype, splitcount=splitcount,
                                        usedChannels=usedChannels)
                elapsed_time += time.time() - start_time
                start_time = time.time()
                print("Main 3.2 [{}] [{}] {}... Elapsed >>> {} min".
                      format(configer.datatype, configer.splitmode, usedChannels, elapsed_time/60))
                logpath = os.path.join(configer.logspath, configer.modelname)
                print(logpath)
                if os.path.exists(logpath):
                    with open(os.path.join(logpath, 'test_log.txt'), 'r') as f:
                        test_log = f.readlines()[0]
                    data_acc[i, j], data_loss[i, j] = parse_log(test_log)
                    print(test_log)
                else:
                    train(configer)
                    data_acc[i, j], data_loss[i, j] = test(configer)
                print("-------------------------------------------------")

        ## save the data
        avg_acc  = np.mean(data_acc, axis=0)
        avg_loss = np.mean(data_loss, axis=0)
        table_data_acc  = np.r_[data_acc, avg_acc.reshape(1, -1)]
        table_data_loss = np.r_[data_loss, avg_loss.reshape(1, -1)]
        table_data = np.r_[table_data_acc, table_data_loss]
        np.savetxt("images/3_2_<data>_[{}].txt".format(datatype), table_data)
def main_3_1(make_table_figure=False):
    datatypes = ["Multi", "RGB"]
    splitcounts = [i for i in range(1, 11)]
    trains = [0.1*(i + 1) for i in range(7)]
    H, W = len(splitcounts), len(trains)

    if make_table_figure:
        for datatype in datatypes:
            print("Generating tables and figures [{}]...".format(datatype))
            table_data = np.loadtxt("images/3_1_<data>_[{}].txt".format(datatype))
            table_data_acc, table_data_loss = np.vsplit(table_data, 2)  # split vertically into two blocks (top/bottom)

            ## build the table
            head_name = "count/split ratio"
            rows_name = [str(i) for i in splitcounts] + ['average']
            cols_name = ["{:.2f}: {:.2f}: 0.2".format(i, 0.8 - i) for i in trains]
            table_acc  = gen_markdown_table_2d(head_name, rows_name, cols_name, table_data_acc)
            table_loss = gen_markdown_table_2d(head_name, rows_name, cols_name, table_data_loss)
            with open("images/3_1_<table>_[{}].md".format(datatype), 'w') as f:
                f.write("\n\nacc\n")
                f.write(table_acc)
                f.write("\n\nloss\n")
                f.write(table_loss)

            ## plot
            plt.figure()
            plt.subplot(121); plt.title("acc")
            avg_acc = table_data_acc[-1]
            plt.bar(np.arange(avg_acc.shape[0]), avg_acc)
            plt.subplot(122); plt.title("loss")
            avg_loss = table_data_loss[-1]
            plt.bar(np.arange(avg_loss.shape[0]), avg_loss)
            plt.savefig("images/3_1_<figure>_[{}].png".format(datatype))
        return

    start_time = time.time(); elapsed_time = 0
    for datatype in datatypes:
        data_acc  = np.zeros(shape=(H, W))
        data_loss = np.zeros(shape=(H, W))
        for i in range(H):  # splitcounts 1, 2, ..., 10
            splitcount = splitcounts[i]
            TEST = 0.2
            for j in range(W):
                valid = 1 - TEST - trains[j]
                splitratio = [trains[j], valid, TEST]
                configer = get_configer(datatype=datatype, splitratio=splitratio,
                                        splitcount=splitcount,
                                        usedChannels=[i+1 for i in range(25)]
                                        if datatype == "Multi" else "RGB")
                elapsed_time += time.time() - start_time
                start_time = time.time()
                print("Main 3.1 [{}] [{}]... Elapsed >>> {} min".
                      format(configer.datatype, configer.splitmode, elapsed_time/60))
                logpath = os.path.join(configer.logspath, configer.modelname)
                print(logpath)
                if os.path.exists(logpath):
                    with open(os.path.join(logpath, 'test_log.txt'), 'r') as f:
                        test_log = f.readlines()[0]
                    data_acc[i, j], data_loss[i, j] = parse_log(test_log)
                    print(test_log)
                else:
                    train(configer)
                    data_acc[i, j], data_loss[i, j] = test(configer)
                print("-------------------------------------------------")

        ## save the data
        avg_acc  = np.mean(data_acc, axis=0)
        avg_loss = np.mean(data_loss, axis=0)
        table_data_acc  = np.r_[data_acc, avg_acc.reshape(1, -1)]
        table_data_loss = np.r_[data_loss, avg_loss.reshape(1, -1)]
        table_data = np.r_[table_data_acc, table_data_loss]
        np.savetxt("images/3_1_<data>_[{}].txt".format(datatype), table_data)
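
# A minimal sketch of what gen_markdown_table_2d might look like, inferred
# from its call sites above (a header cell, row labels, column labels, and a
# 2-D array). This is an assumption, not the repo's actual implementation.
def gen_markdown_table_2d(head_name, rows_name, cols_name, data):
    lines = ["| {} | {} |".format(head_name, " | ".join(cols_name)),
             "|" + " --- |" * (len(cols_name) + 1)]
    for row_label, row in zip(rows_name, data):
        cells = " | ".join("{:.4f}".format(v) for v in row)
        lines.append("| {} | {} |".format(row_label, cells))
    return "\n".join(lines)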