def evaluateSpecificDataSet(eng): # evaluate data set dRectWnd = 2.0 dSMWnd = 2.0 dSCWnd = 0.15 strCoder = ecc.CODER_RS m = 4 n = 2**m-1 k = 1 r = (n-k)/2 nInterleaving = 25 print "%s: n=%d, k=%d, m=%d, r=%d, interleave=%d" % \ (strCoder, n, k, m, r, nInterleaving) strWorkingDir = "../../data/evaluation/selected_set/" lsFilePath = cf.getFileList(strWorkingDir, None) strKeyOutput = "keys" with open(strWorkingDir+strKeyOutput, 'w+') as hFile: srMean, srStd, dfDetailed = evaluateDataSet('selected', strWorkingDir, lsFilePath, dRectWnd, dSMWnd, dSCWnd, eng, strCoder, n, k, m, r, nInterleaving, bOutputData=False, bReconciliation=False, hKeyOutput=hFile) return srMean, srStd, dfDetailed
def evaluateMutualInformation():
    """
    scatter plot & mutual information

    Runs evaluateDataSet() on the mutual-info data set with source
    encoding on and reconciliation off, collecting per-file output data.

    Returns:
    -------
    (dfData, lsOutputData): the concatenated output frame and the raw
    list of per-file frames it was built from.
    """
    strWorkingDir = "../../data/evaluation/mutual_info/"
    lsFilePath = cf.getFileList(strWorkingDir, None)

    lsOutputData = []   # filled in-place by evaluateDataSet
    srMean, srStd, dfDetailed = evaluateDataSet(
        'selected', strWorkingDir, lsFilePath,
        bSourceEncoding=True,
        bReconciliation=False,
        bOutputData=True,
        lsOutputData=lsOutputData)

    dfData = pd.concat(lsOutputData, axis=0)
    return dfData, lsOutputData
def evaluateGesture(eng): """ This function evaluates the performance of different gestures. """ # parameters dRectWnd = 2.0 dSMWnd = 2.0 dSCWnd = 0.1 strCoder = ecc.CODER_GOLAY m = 1 n = 23 k = 12 r = 3 nInterleaving = 25 print "%s: n=%d, k=%d, m=%d, r=%d, interleave=%d" % \ (strCoder, n, k, m, r, nInterleaving) lsResult = [] strWorkingDir = "../../data/evaluation/gesture/" for strLabel in ['g1', 'g2', 'g3']: strFileNamePattern= strLabel lsFilePath = cf.getFileList(strWorkingDir, strFileNamePattern) if (len(lsFilePath) != 0 ): srMean, srStd, dfDetailed = evaluateDataSet(strLabel, strWorkingDir, lsFilePath, dRectWnd, dSMWnd, dSCWnd, eng, strCoder, n, k, m, r, nInterleaving) lsResult.append(srMean) # print out rotten apples dfSelected = dfDetailed[dfDetailed[sd.BER_USER_SRC]>=0.1] if(dfSelected.shape[0] != 0): print "--records with high BER--" print dfSelected[\ [sd.FILE_NAME, sd.BER_USER_SRC, sd.BER_USER_EC]] print "----\n" dfSummary = pd.concat(lsResult, axis=1) return dfSummary
def evaluateDistance(eng): """ This function evaluate the effect of distance btw A and B """ # parameters dRectWnd = 2.0 dSMWnd = 2.0 dSCWnd = 0.1 strCoder = ecc.CODER_GOLAY m = 1 n = 23 k = 12 r = int(math.floor((n-k)/2.0) ) nInterleaving = 25 print "%s: n=%d, k=%d, m=%d, r=%d, interleave=%d" % \ (strCoder, n, k, m, r, nInterleaving) lsResult = [] strWorkingDir = "../../data/evaluation/distance/" for strLabel in ['d1', 'd2', 'd3']: strFileNamePattern= strLabel lsFilePath = cf.getFileList(strWorkingDir, strFileNamePattern) srMean, srStd, dfDetailed = evaluateDataSet(strLabel, strWorkingDir, lsFilePath, dRectWnd, dSMWnd, dSCWnd, eng, strCoder, n, k, m, r, nInterleaving) lsResult.append(srMean) # print out rotten apples dfSelected = dfDetailed[dfDetailed[sd.BER_USER_SRC]>=0.1] if(dfSelected.shape[0] != 0): print "--records with high BER--" print dfSelected[\ [sd.FILE_NAME, sd.BER_USER_SRC, sd.BER_USER_EC]] print "----\n" dfSummary = pd.concat(lsResult, axis=1) return dfSummary
def evaluateShapeCodingParams(eng): """ This function evaluate the parameter of shape coding """ # select data strWorkingDir = "../../data/evaluation/BER/" strFileNamePattern= None lsFilePath = cf.getFileList(strWorkingDir, strFileNamePattern) # params lsSCWnd = np.arange(0.05, 0.3, 0.05) dRectWnd = 2.0 dSMWnd = 2.0 strCoder = ecc.CODER_GOLAY m = 1 n = 23 k = 12 r = 2 nInterleaving = 25 print "%s: n=%d, k=%d, m=%d, r=%d, interleave=%d" % \ (strCoder, n, k, m, r, nInterleaving) # test lsResult = [] for dCodingWnd in lsSCWnd: print "evalauting SCWnd=%.2f..." % dCodingWnd for fn in lsFilePath: lsDataResult = sd.evaluateSingleData(strWorkingDir, fn, dRectDuration = dRectWnd, dSMDurction = dSMWnd, dSCDuration = dCodingWnd, eng=eng, strCoder=strCoder, n=n, k=k, m=m, r=r, nInterleaving=nInterleaving) lsResult.extend(lsDataResult) dfResult = pd.DataFrame(lsResult) gp = dfResult.groupby(dfResult[sd.WND_SC]) dfMean = gp.mean() return dfMean, dfResult
def validateOnSH(strInPath, strOutPath, bSerialize=False):
    '''
        this function validates user diversity on Shanghai data set

        param:
                strInPath  - path for separate files of top users
                strOutPath - path to serialize model

        Note: we need another script to distribute records of top users
              into separate files; this function will only read separate
              files from strInPath
    '''
    # find xdr files (one file per top user)
    lsXDR = common_function.getFileList(strInPath, "out")

    dcVariableImportance = {}   # variable importance of each personal model
    dcModels = {}               # IMSI -> (best CV score, best model)
    for xdr in lsXDR:
        # load data
        print("processing %s..." % xdr)
        dfData = pd.read_csv(xdr, sep='|', \
            names=['BEGIN_TIME', 'BEGIN_TIME_MSEL', 'MSISDN', 'IMSI', 'SERVER_IP', \
                   'SERVER_PORT', 'APN', 'PROT_CATEGORY', 'PROT_TYPE', 'LAC', 'SAC', \
                   'CI', 'IMEI', 'RAT', 'HOST', 'STREAMING_URL', 'STREAMING_FILESIZE', \
                   'STREAMING_DW_PACKETS', 'STREAMING_DOWNLOAD_DELAY', 'ASSOCIATED_ID', \
                   'L4_UL_THROUGHPUT', 'L4_DW_THROUGHPUT', 'use_less'])
        del dfData['use_less']
        # DOWNLOAD_RATIO = STREAMING_DW_PACKETS / STREAMING_FILESIZE
        # (columns 17 and 16 of the schema above)
        dfData['DOWNLOAD_RATIO'] = dfData.iloc[:, 17]*1.0/dfData.iloc[:, 16]
        strIMSI = xdr.split('/')[-1].split('.')[0]

        # prepare data set
        mtX, arrY, lsTrainingFeatureNames = preprocessDataSet(
            dfData, g_lsSelectedColumns,
            g_lsNumericColumns,
            g_lsCategoricalColumns,
            'DOWNLOAD_RATIO')

        # cross validation: keep the fold model with the best score
        # (crossValidate returns model -> score)
        dcPersonalModels = crossValidate(mtX, arrY, g_modelParams, 10)
        bestModel, fBestScore = max(dcPersonalModels.iteritems(),
                                    key=operator.itemgetter(1))
        dcVariableImportance[strIMSI] = getVariableImportance(
            bestModel, lsTrainingFeatureNames)
        dcModels[strIMSI] = (fBestScore, bestModel)
        # BUG FIX: the original format string contained a raw newline and a
        # stray ')' inside the literal, which is a syntax error.
        print("model:%s, #record=%d, best=%0.2f, mean=%.2f, std=%0.2f.\n" % \
              (strIMSI, len(arrY), fBestScore,
               np.mean(dcPersonalModels.values()),
               np.std(dcPersonalModels.values())))

    dfVariableImportance = pd.DataFrame(dcVariableImportance).T

    # serialize models
    if (bSerialize is True):
        # BUG FIX: serialize the accumulated per-user models (dcModels);
        # the original wrote dcPersonalModels, which only holds the CV
        # fold models of the LAST user processed in the loop.
        common_function.serialize2File(strOutPath+'serDcModels.out', dcModels)
        dfVariableImportance.to_csv(strOutPath+'dfVariableImportance_all.out')
    return dcModels, dfVariableImportance
def evaluateReconciliationParams(eng): """ Evaluate the effect of reconciliation parameters, i.e., m, k, on system performance Paramters: --------- lsFileList: list of data files eng: instance of matlab engine strCoder: name of coder lsM: the possible values of m lsR: the possible values of r Returns: a pandas.DataFrame consisted of all performance result """ # select data strWorkingDir = "../../data/evaluation/reconciliation/" strFileNamePattern= None lsFilePath = cf.getFileList(strWorkingDir, strFileNamePattern) # parameter strCoder = ecc.CODER_RS lsM = [4,] lsR = range(1, 8) dRectWnd = 2.0 dSMWnd = 2.0 dSCWnd = 0.15 # evaluate lsResult = [] if (strCoder == ecc.CODER_RS): for m in lsM: for r in lsR: n = 2**m - 1 k = n - 2*r if(k<1 or n*m>=500): break print "testing m=%d, r=%d..." % (m, r) for fn in lsFilePath: lsDataResult = sd.evaluateSingleData(strWorkingDir, fn, dRectDuration=dRectWnd, dSMDuration=dSMWnd, dSCDuration=dSCWnd, eng=eng, strCoder=strCoder, n=n, k=k, m=m, r=r) lsResult.extend(lsDataResult) elif strCoder == ecc.CODER_GOLAY: n = 23 k = 12 m = 1 r = 2 for fn in lsFilePath: lsDataResult = sd.evaluateSingleData(strWorkingDir, fn, dRectDuration=dRectWnd, dSMDuration=dSMWnd, dSCDuration=dSCWnd, eng=eng, strCoder=strCoder, n=n, k=k, m=m, r=r) lsResult.extend(lsDataResult) # result dfResult = pd.DataFrame(lsResult) dcMatchingRate = {} for r in lsR: nMatchedKey = (dfResult[sd.ERR_USER_EC][ (dfResult[sd.R]==r) & \ (dfResult[sd.ERR_USER_EC]==0) ]).count() nTotalKey = dfResult[sd.ERR_USER_EC][dfResult[sd.R]==r].count() dMatchingRate = nMatchedKey * 1.0 / nTotalKey dcMatchingRate[r] = dMatchingRate srMatchingRate = pd.Series(dcMatchingRate) return dfSummary, dfResult, srMatchingRate