示例#1
0
def _get_df_for_quic_data(coll_name):
    """Load the documents of ``coll_name`` into a date-sorted DataFrame.

    Every document returned for the collection becomes one row with the
    columns ``Date`` (built from the year/month/day found under
    ``doc['meta']['ts']``), ``pkts_percentage`` and ``bytes_percentage``
    (both read from ``doc['items']`` and converted to float).

    Args:
        coll_name: name of the collection to read from
            ``globals.ANALYSIS_DB_NAME``.

    Returns:
        A ``(status, df)`` tuple. ``status`` is the value reported by
        ``dbif.db_collection_find_records``. ``df`` holds the rows sorted by
        ``Date`` when ``status == dbif.DBIF_OK`` (it has the three columns but
        no rows if the collection yielded no documents) and is a completely
        empty DataFrame otherwise.
    """
    logger.info('converting ' + coll_name + ' to dataframe for easy analysis')
    status, _error, cursor = dbif.db_collection_find_records(globals.ANALYSIS_DB_NAME, coll_name)

    if status != dbif.DBIF_OK:
        # The original referenced an undefined name here, so a failed lookup
        # raised NameError instead of being logged and reported to the caller.
        logger.error('error while converting collection ' + coll_name + ' to dataframe')
        return status, pd.DataFrame()

    # Collect plain records first and build the frame once; growing a
    # DataFrame cell by cell through .loc reallocates it on every new row.
    rows = []
    for doc in cursor:
        ts = doc['meta']['ts']
        items = doc['items']
        rows.append({
            'Date': str(ts['year']) + '-' + str(ts['month']) + '-' + str(ts['day']),
            'pkts_percentage': float(items['pkts_percentage']),
            'bytes_percentage': float(items['bytes_percentage']),
        })

    # Passing the columns explicitly fixes their order and guarantees that
    # 'Date' exists even when the cursor yielded nothing.
    df = pd.DataFrame(rows, columns=['Date', 'pkts_percentage', 'bytes_percentage'])

    # sort on the Date column
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(by='Date')
    # replace NaNs with 0.0, which is a sensible default for the percentages
    df = df.fillna(0.0)
    logger.info('created dataframe')
    return status, df
示例#2
0
def _get_df_for_protocol_data(coll_name):
    """Load per-protocol packet percentages of ``coll_name`` into a DataFrame.

    Every document returned for the collection becomes one row. The ``Date``
    column is built from the year/month/day found under ``doc['meta']['ts']``.
    Each entry of ``doc['items']`` contributes its ``pkts_percentage`` (as a
    float) to a column named after ``str(item['protocol'])``; entries whose
    protocol is ``'nan'``, ``'unassigned'`` or the misspelt ``'unassgined'``
    are summed into a single ``unknown`` column instead.

    Args:
        coll_name: name of the collection to read from
            ``globals.ANALYSIS_DB_NAME``.

    Returns:
        A ``(status, df)`` tuple. ``status`` is the value reported by
        ``dbif.db_collection_find_records``. ``df`` holds the rows sorted by
        ``Date`` with missing protocol values filled with 0.0 when
        ``status == dbif.DBIF_OK`` (it has only an empty ``Date`` column if the
        collection yielded no documents) and is a completely empty DataFrame
        otherwise.
    """
    logger.info('converting ' + coll_name + ' to dataframe for easy analysis')
    status, _error, cursor = dbif.db_collection_find_records(globals.ANALYSIS_DB_NAME, coll_name)

    if status != dbif.DBIF_OK:
        # The original referenced an undefined name here, so a failed lookup
        # raised NameError instead of being logged and reported to the caller.
        logger.error('error while converting collection ' + coll_name + ' to dataframe')
        return status, pd.DataFrame()

    # special handling for nan and unassigned (also handle the typo)
    unknown_labels = ('nan', 'unassgined', 'unassigned')

    # Track columns in order of first appearance so the resulting layout does
    # not depend on how the DataFrame constructor orders dict keys.
    columns = ['Date']
    seen = set(columns)

    # Collect plain records first and build the frame once; growing a
    # DataFrame cell by cell through .loc reallocates it on every new
    # row/column.
    rows = []
    for doc in cursor:
        ts = doc['meta']['ts']
        row = {'Date': str(ts['year']) + '-' + str(ts['month']) + '-' + str(ts['day'])}
        unknown_total = 0.0
        for item in doc['items']:
            proto = str(item['protocol'])
            share = float(item['pkts_percentage'])
            if proto in unknown_labels:
                # Several unknown-like entries may occur in one document;
                # the column carries their running sum.
                unknown_total += share
                column, value = 'unknown', unknown_total
            else:
                column, value = proto, share
            row[column] = value
            if column not in seen:
                seen.add(column)
                columns.append(column)
        rows.append(row)

    df = pd.DataFrame(rows, columns=columns)

    # sort on the Date column
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(by='Date')
    # Not every document lists every protocol; a protocol absent on a given
    # date is treated as 0.0 percent of the packets.
    df = df.fillna(0.0)
    logger.info('created dataframe')
    return status, df