def get_logdata_df():
    '''
    Collect the rows to be written to the database.

    @return: DataFrame with columns ['user', 'app', 'duration'];
             an empty DataFrame when the conn log file does not exist.
    '''
    file_path_all = get_file('conn')
    # file_path_all='D:\\ipv4&6\\00-pyTraff\\test_file\\conn.gz'
    # Bail out early when the log file is missing.
    if not os.path.exists(file_path_all):
        return pd.DataFrame()
    df = read_file.pandas_normal_gz(file_path_all)
    # Drop the trailing summary line; keep ts, orig IP, resp port.
    df = df.iloc[:-1, [0, 2, 5]]
    df = df.dropna(how='any')
    grouped = df.groupby([2, 5])
    db_port = get_data_base_port()  # app ports registered in the database
    # Clamp start timestamps to one hour ago so a duration never reaches
    # back beyond the reporting window; hoisted out of the loop.
    hour_ago = get_past_hour_ts(1)
    np_list = []
    for gp in grouped.groups:
        data_port = gp[1]
        # O(1) dict membership instead of scanning iterkeys()
        # (which was a linear scan and Python-2 only).
        if data_port in db_port:
            gp_df = grouped.get_group(gp)
            min_ts = gp_df.iloc[:, 0].min()
            if min_ts < hour_ago:
                min_ts = hour_ago
            max_ts = gp_df.iloc[:, 0].max()
            duration = round(max_ts - min_ts, 3)
            np_list.append([gp[0], db_port[data_port], duration])
    df_result = pd.DataFrame(np_list, columns=['user', 'app', 'duration'])
    return df_result
def get_conn_content():
    '''
    Read the conn log and return the fid/user/byte-count columns.

    @return: DataFrame with columns ['fid', 'user', 'orig_ip_bytes',
             'resp_ip_bytes']; empty DataFrame when the file is missing.
    '''
    file_path_all = get_file('conn')
    # file_path_all='D:\\ipv4&6\\00-pyTraff\\test_file\\7_11_conn.gz'
    if not os.path.exists(file_path_all):
        return pd.DataFrame()
    df = read_file.pandas_normal_gz(file_path_all)
    # .copy() so the column assignment below acts on an independent frame
    # instead of a possible view (avoids SettingWithCopyWarning).
    df_drop = df.iloc[:, [1, 2, 17, 19]].copy()
    df_drop.columns = ['fid', 'user', 'orig_ip_bytes', 'resp_ip_bytes']
    return df_drop
def get_http_video():
    '''
    Map user-agent strings from the http log to known video apps.

    @return: DataFrame with columns ['fid', 'video_name'], one row per
             (fid, user-agent) group recognized as a video app; empty
             DataFrame when the http log file does not exist.
    '''
    # Generic browser/runtime user agents that are never video apps.
    browser_prefixes = ('Mozilla', 'Dalvik', 'Safari', 'Opera')
    # user-agent prefix -> canonical video app name, checked in order.
    video_prefixes = (
        ('Youku', 'Youku'),
        ('youku-tudou', 'Youku'),
        ('MGTV', 'MGTV'),
        ('SOHUVideo', 'SOHUVideo'),
        ('QYPlayer', 'QYPlayer'),
        ('qqlive', 'qqlive'),
        ('kwai', 'kwai'),
        ('PPStream', 'PPStream'),
        ('Letv', 'Letv'),
        ('Funshion', 'Funshion'),
        ('Xfplay', 'Xfplay'),
    )
    file_path_all = get_file('http')
    # file_path_all='D:\\ipv4&6\\00-pyTraff\\test_file\\7_17_http.gz'
    if not os.path.exists(file_path_all):
        return pd.DataFrame()
    df = read_file.pandas_normal_gz(file_path_all)
    df_drop = df.iloc[:, [0, 1, 12]]
    usagent_content = df_drop.dropna(axis=0)  # drop rows with empty content
    result = usagent_content.groupby([1, 12])  # fid, user-agent content
    app_use = []  # [fid, video app name]
    for fid, app_name in result.groups:
        # str.startswith accepts a tuple: one call replaces the old
        # chained `or` conditions.
        if app_name.startswith(browser_prefixes):
            continue
        for prefix, video_name in video_prefixes:
            if app_name.startswith(prefix):
                app_use.append([fid, video_name])
                break
    df_app_use = pd.DataFrame(app_use, columns=['fid', 'video_name'])
    return df_app_use
def get_data(filename):
    '''
    Collect the statistics to be written out.

    @param filename: path to the gzipped conn log file
    @return: tuple (log_file, conn_all, traff_all, traff_orig, traff_resp)
             -- log file info, total connections, total traffic,
             origin traffic, responder traffic
    '''
    log_file = get_log_file_size()
    df = read_file.pandas_normal_gz(filename)
    # int() instead of the Python-2-only long(): Python 2 ints promote
    # automatically and Python 3 ints are unbounded, so behavior is the same.
    conn_all = int(df[1].count())
    traff_resp = int(df[19].sum())
    traff_orig = int(df[17].sum())
    traff_all = traff_orig + traff_resp
    return log_file, conn_all, traff_all, traff_orig, traff_resp
def get_log():
    '''
    Build the log DataFrame (protocol, ports, origin IP) from the conn
    file. Returns an empty DataFrame when the file is absent.
    '''
    empty = pd.DataFrame()
    conn_path = get_file('conn')
    # file_path_all='test_file/conn.gz'
    if not os.path.exists(conn_path):
        return empty
    raw = read_file.pandas_normal_gz(conn_path)
    # Drop the trailing summary row; keep protocol/ports/origin IP.
    result = raw.iloc[:-1, [6, 3, 5, 2]]
    result.rename(
        columns={6: 'protocol', 3: 'orig_port', 5: 'resp_port', 2: 'orig_ip'},
        inplace=True)
    return result
def get_data():
    '''
    Aggregate traffic per (origin IP, responder IP) pair and return the
    ten heaviest talkers.

    @return: DataFrame indexed by ('orignIp', 'respIp') with summed
             'orign' and 'resp' byte counts plus their total in 'results',
             sorted by 'results' descending, top 10 rows.
    '''
    # Read the gzipped log via the read_file helper module.
    # df_conn=read_file.pandas_normal('conn.log')
    conn_path = get_file()
    frame = read_file.pandas_normal_gz(conn_path)
    frame.rename(
        columns={2: 'orignIp', 4: 'respIp', 17: 'orign', 19: 'resp'},
        inplace=True)
    # Positional selection still works after the rename: iloc addresses
    # positions 2, 4, 17, 19, which hold the four renamed columns.
    subset = frame.iloc[:-1, [2, 4, 17, 19]]
    totals = subset.groupby(['orignIp', 'respIp']).sum()
    totals['results'] = totals['orign'] + totals['resp']
    return totals.sort_values(by='results', ascending=False).head(10)
def get_data():
    '''
    Aggregate per-service traffic from the conn log.

    @return: DataFrame of summed orig/resp bytes grouped by service, or
             0 when the log file does not exist (kept for backward
             compatibility with existing callers).
    '''
    file_path_all = get_file('conn')
    # file_path_all='conn.gz'
    if not os.path.exists(file_path_all):
        return 0
    all_data = read_file.pandas_normal_gz(file_path_all)
    # Drop the trailing summary row; keep service, orig_bytes, resp_bytes.
    value = all_data.iloc[:-1, [7, 17, 19]]
    useful_data = value.dropna(how='any')  # drop rows with missing values
    # One rename call instead of three separate in-place renames; the
    # non-inplace form also avoids mutating a sliced frame.
    useful_data = useful_data.rename(
        columns={7: 'service', 17: 'orig_bytes', 19: 'resp_bytes'})
    return useful_data.groupby('service').sum()
def get_logdata_df():
    '''
    Collect per-user web-browsing durations for the database.

    @return: DataFrame with columns ['user', 'webhost', 'duration'];
             an empty DataFrame when the http log file does not exist.
    '''
    # FIXME: the original source had a bare "pattern=" (a syntax error).
    # Match everything by default; adjust to the local network environment.
    pattern = r'.*'
    file_path_all = get_file('http')
    # file_path_all='test_file/http.gz'
    if not os.path.exists(file_path_all):
        return pd.DataFrame()
    df = read_file.pandas_normal_gz(file_path_all)
    # Keep only rows whose origin IP matches the configured pattern.
    df = df[df.iloc[:, 2].str.match(pattern)]
    df = df.iloc[:-1, [0, 2, 8]]  # ts, orig IP, host
    df = df.dropna(how='any')
    grouped = df.groupby([2, 8])
    db_host = get_data_base_host()  # website hosts registered in the database
    # One-hour clamp for start timestamps; hoisted out of the loop.
    hour_ago = get_past_hour_ts(1)
    np_list = []
    for gp in grouped.groups:
        parts = gp[1].split('.')
        if len(parts) < 2:
            # A single-label host could never match a registered site.
            continue
        # Pick the registrable label: skip a country-code second level
        # such as "com.cn", but only when a third label exists (guards
        # the IndexError the old code hit on two-label hosts).
        if parts[-2] in ['com', 'cn', 'net', 'gov', 'org'] and len(parts) >= 3:
            data_host = parts[-3]
        else:
            data_host = parts[-2]
        # O(1) dict membership instead of scanning iterkeys() (py2-only).
        if data_host in db_host:
            gp_df = grouped.get_group(gp)
            min_ts = gp_df.iloc[:, 0].min()
            if min_ts < hour_ago:
                min_ts = hour_ago
            max_ts = gp_df.iloc[:, 0].max()
            np_list.append([gp[0], db_host[data_host], min_ts])
            np_list.append([gp[0], db_host[data_host], max_ts])
    df_result = pd.DataFrame(np_list, columns=['user', 'webhost', 'ts'])
    # Second pass: merge groups that mapped to the same registered host
    # and turn their min/max timestamps into a duration.
    result_group = df_result.groupby(['user', 'webhost'])
    np_list2 = []
    for gp in result_group.groups:
        gp_df = result_group.get_group(gp)
        min_ts = gp_df.iloc[:, 2].min()
        max_ts = gp_df.iloc[:, 2].max()
        duration = round(max_ts - min_ts, 3)
        np_list2.append([gp[0], gp[1], duration])
    return pd.DataFrame(np_list2, columns=['user', 'webhost', 'duration'])
def get_log():
    '''
    Extract protocol, ports and origin IP from the conn log.
    Returns an empty DataFrame if the file cannot be found.
    '''
    file_path = get_file('conn')
    # file_path_all='test_file/conn.gz'
    if not os.path.exists(file_path):
        return pd.DataFrame()
    raw_df = read_file.pandas_normal_gz(file_path)
    # Trailing summary row dropped; columns reordered to
    # protocol / orig_port / resp_port / orig_ip.
    selected = raw_df.iloc[:-1, [6, 3, 5, 2]]
    selected.rename(columns={6: 'protocol',
                             3: 'orig_port',
                             5: 'resp_port',
                             2: 'orig_ip'},
                    inplace=True)
    return selected
def get_file_type():
    '''
    Count occurrences of each file type in the files log.

    @return: DataFrame with columns ['file_type', 'count1'] holding the
             15 most common types in ascending count order; an empty
             DataFrame when the files log does not exist.
    '''
    file_path_all = get_file('files')
    # file_path_all='D:\\ipv4&6\\00-pyTraff\\test_file\\12files.gz'
    if not os.path.exists(file_path_all):
        return pd.DataFrame()
    df = read_file.pandas_normal_gz(file_path_all)
    files_content = df.iloc[:, [8]]  # the file_type column
    files_content = files_content.dropna(axis=0)  # drop empty rows
    # Top 15 types, ascending (tail of the count-sorted size series).
    counts = files_content.groupby([8]).size().sort_values().tail(15)
    # Build the result straight from the series -- no parallel lists,
    # no zip, no shadowing of the builtin ``type``.
    df_type_result = counts.reset_index()
    df_type_result.columns = ['file_type', 'count1']
    return df_type_result
def get_user_data():
    '''
    Build per-user traffic summaries from the conn log: the first and
    last timestamp plus total upstream and downstream bytes per origin
    IP. Returns a list of traffUser objects, or an empty DataFrame when
    the log file is missing.
    '''
    conn_path = get_file('conn')
    # file_path_all='test_file/conn.gz'
    if not os.path.exists(conn_path):
        return pd.DataFrame()
    frame = read_file.pandas_normal_gz(conn_path)
    frame = db_config.filter_ip_df(frame, 2)
    # ts, orig IP, orig bytes, resp bytes; trailing summary row dropped.
    frame = frame.iloc[:-1, [0, 2, 9, 10]]
    frame = frame.dropna(how='any')
    per_user = frame.groupby(2)
    users = []
    for ip in per_user.groups:
        # Positions 0/2/3 of the 4-column frame: ts, orig bytes, resp bytes.
        cols = per_user.get_group(ip).iloc[:, [0, 2, 3]]
        first_ts = cols.iloc[:, 0].min()
        last_ts = cols.iloc[:, 0].max()
        sent = cols.iloc[:, 1].sum()      # outgoing (orig bytes)
        received = cols.iloc[:, 2].sum()  # incoming (resp bytes)
        users.append(traffUser(ip, first_ts, last_ts, received, sent))
    return users