def prepare_user_sessions(source_file, data_dir):
    """Split an xz-compressed, pipe-delimited request log into per-user-session files.

    Each accepted log line becomes an ``Operation`` appended to a pipe keyed by
    ``"<user_id>_<host_id>"`` under *data_dir* (files get the suffix
    ``.user_session.csv``). Lines with too few fields, a non-numeric start
    timestamp, an unknown request verb, or (for GET/PUT) a non-numeric file
    size are skipped.

    Parameters
    ----------
    source_file : str
        Path to the lzma/xz-compressed input log.
    data_dir : str
        Directory where the per-session output files are written.
    """
    # verb -> (single-char optype written to the output, whether a numeric
    # FILE_SIZE field is required and recorded for this verb).
    _REQUEST_DISPATCH = {
        'GET':    ('g', True),
        'PUT':    ('p', True),
        'DEL':    ('d', False),
        'RENAME': ('r', False),
    }

    pipes = Pipes(data_dir, suffix=".user_session.csv")
    with lzma.open(source_file, 'rt') as sf:
        plines = 0
        t = time.time()
        for line in sf:
            plines += 1
            if plines % MONITOR_LINES == 0:
                # Progress heartbeat: lines processed, peak RSS, throughput.
                print("processed lines: %d mem: %rMB, lines/s: %r" %
                      (plines,
                       float(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) / 1024,
                       int(MONITOR_LINES / (time.time() - t))))
                t = time.time()

            elems = line.split('|')
            if len(elems) <= EXECUTION_TIME:
                continue  # truncated line: not enough fields to index safely
            if not elems[START_TIME].isdigit():
                print("bad line: %s" % line)
                continue

            dispatch = _REQUEST_DISPATCH.get(elems[REQUEST])
            if dispatch is None:
                continue  # unknown verb: silently ignored, as before
            optype, needs_size = dispatch
            if needs_size and not elems[FILE_SIZE].isdigit():
                continue  # GET/PUT without a numeric size is unusable

            # Non-numeric execution time falls back to 0 (original behavior).
            execution_time = 0
            if elems[EXECUTION_TIME].isdigit():
                execution_time = int(elems[EXECUTION_TIME])

            path = elems[PARAMS].strip()
            op = Operation()
            op.ts = int(elems[START_TIME])
            op.optype = optype
            op.obj_id = get_md5(path, hexdigest=True)
            op.parent_dir_id = get_md5(os.path.dirname(path), hexdigest=True)
            if needs_size:
                op.size = int(elems[FILE_SIZE])
            op.execution_time = execution_time

            pipes.write_to(elems[USER_ID] + "_" + elems[HOST_ID], str(op))
    pipes.close()
] #remove the old log file, as outpipe is append only. if os.path.exists(os.path.join(results_dir, target_file_name)): os.remove(os.path.join(results_dir, target_file_name)) out_pipe = Pipes(results_dir) csv_header = ";".join([ "user_id", "from_ts", "till_ts", "session_lifetime", "get_requests", "reget_requests", "put_requests", "get_bytes", "put_bytes", "rename_requests", "del_requests", "get_dirs", "put_dirs", "put_files_per_dir", "get_files_per_dir", "window_seconds" ]) out_pipe.write_to(target_file_name, csv_header) cnt = 0 for sf in users_session_files: cnt += 1 print("working on %d/%d" % (cnt, len(users_session_files))) analyze_user_session(sf, out_pipe, target_file_name) # if cnt >=20: # break out_pipe.close() print("wrote results to %s: " % (os.path.join(results_dir, target_file_name)))