def getHistogramFromFile(fin, bucketSize, fout):
    """Build a file-size histogram from a listing file and save it as CSV.

    Every non-blank line of `fin` must start with an integer file size;
    any further whitespace-separated fields on the line are ignored.

    Parameters:
        fin        -- path of the text file to read.
        bucketSize -- width of one histogram bucket; a size lands in bucket
                      `size // bucketSize`.
        fout       -- path of the CSV file to (over)write.

    Output:
        * prints the number of files, the total size and the average size;
        * writes one CSV row per non-empty bucket, in ascending bucket
          order: bucket id, file count, lower bound, upper bound (the two
          bounds are formatted with const.humanReadable).

    Raises:
        ValueError        -- the first field of a line is not an integer.
        ZeroDivisionError -- bucketSize is 0.
    """
    totalFile = 0
    totalSize = 0
    bucket = dict()

    # 'with' guarantees the handle is released even when a line is malformed
    with open(fin, 'r') as fi:
        for line in fi:
            field = line.split()
            if not field:
                continue  # blank / whitespace-only line
            size = int(field[0])
            # explicit floor division: bucket ids stay whole numbers
            bucket_id = size // bucketSize
            bucket[bucket_id] = bucket.get(bucket_id, 0) + 1
            totalSize += size
            totalFile += 1

    # an empty listing must not crash the report with a division by zero
    if totalFile:
        avgSize = totalSize * 1.0 / totalFile
    else:
        avgSize = 0.0

    print('Total file\t' + str(totalFile))
    print('Total size\t' + humanReadable(totalSize))
    print('Avg size \t' + humanReadable(avgSize))

    # write to csv file -> draw chart by Excel
    with open(fout, 'w') as fo:
        for key in sorted(bucket):
            fo.write(str(key) + ',' +
                     str(bucket[key]) + ',' +
                     const.humanReadable(key * bucketSize) + ',' +
                     const.humanReadable((key + 1) * bucketSize) + '\n')
def getHistogram(path, bucketSize, fout):
    """Walk `path`, bucket every file by size and save the histogram as CSV.

    Parameters:
        path       -- root directory to scan (recursively, via os.walk).
        bucketSize -- width of one histogram bucket; a size lands in bucket
                      `size // bucketSize`.
        fout       -- path of the CSV file to (over)write.

    Files located directly inside const.SCAN_DIR or const.FILESYSTEM_PATH
    are not counted (the comparison is an exact string match on the
    directory path reported by os.walk); their sub-directories are still
    visited.

    Output:
        * writes one CSV row per non-empty bucket, in ascending bucket
          order: bucket id, file count, lower bound, upper bound (the two
          bounds are formatted with const.humanReadable);
        * prints the min / avg / max / total size of the counted files
          (all reported as 0 when no file was counted).

    If `path` does not exist a message is printed and nothing is written.
    """
    # check the directory that is actually scanned
    if not os.path.exists(path):
        print("path doesn't exisit")
        return

    bucket = dict()   # bucket id -> number of files
    minSize = None    # None until the first file is seen
    maxSize = None
    totalSize = 0
    nFile = 0

    # scan through all files, get their sizes, put them in buckets
    for dirPath, subdirs, files in os.walk(path):
        # bypass the scan folder and the meta files
        if dirPath == const.SCAN_DIR or dirPath == const.FILESYSTEM_PATH:
            continue
        for f in files:
            size = os.path.getsize(os.path.join(dirPath, f))
            if minSize is None:
                minSize = maxSize = size
            else:
                minSize = min(size, minSize)
                maxSize = max(size, maxSize)
            totalSize += size
            nFile += 1
            # explicit floor division: bucket ids stay whole numbers
            bucket_id = size // bucketSize
            bucket[bucket_id] = bucket.get(bucket_id, 0) + 1

    # write to csv file -> draw chart by Excel
    with open(fout, 'w') as fo:
        for key in sorted(bucket):
            fo.write(str(key) + ',' +
                     str(bucket[key]) + ',' +
                     const.humanReadable(key * bucketSize) + ',' +
                     const.humanReadable((key + 1) * bucketSize) + '\n')

    # an empty scan must not crash the report with a division by zero
    if nFile == 0:
        minSize = 0
        maxSize = 0
        avgSize = 0.0
    else:
        avgSize = float(totalSize) / nFile

    # write statistics
    print('min size:\t%s' % const.humanReadable(minSize))
    print('avg size:\t%s' % const.humanReadable(avgSize))
    print('max size:\t%s' % const.humanReadable(maxSize))
    print('total size:\t%s' % const.humanReadable(totalSize))
def createFileSystem(filesystemPath, total, sizeDist, repProb, nRepDist, mutProb, nMutDist, mutantLevel, quota = None, logFile = None):
    """Recreate a synthetic file system under `filesystemPath` and log it.

    WARNING: everything already stored under `filesystemPath` is deleted
    before the new tree is generated.

    Parameters:
        filesystemPath -- root directory of the generated file system. A
                          trailing '/' is appended when missing and the
                          result is stored in the module-level `fsPath`.
        total          -- target number of files. The number of unique
                          files is round((1 - repProb - mutProb) * total).
        sizeDist       -- mapping {distribution: share}. Each key exposes
                          `.name` and `.params`. A private copy is passed
                          to transformDist, so the caller's dict is not
                          rebound.
        repProb        -- probability mass of replicated files.
        nRepDist       -- distribution (with `.name` / `.params`) handed to
                          drawNumber to pick the number of replicas.
        mutProb        -- probability mass of mutant files.
        nMutDist       -- distribution (with `.name` / `.params`) handed to
                          drawNumber to pick the number of mutants.
        mutantLevel    -- forwarded unchanged to createMutants.
        quota          -- optional cap on the accumulated size; generation
                          stops after the first unique file that pushes
                          the total above it.
        logFile        -- optional extra log target passed to log().
                          NOTE(review): presumably a file path - confirm
                          against log().

    Side effects:
        * wipes and recreates the directory tree, then creates the
          numbered sub-folders and the generated files;
        * writes the unique / mutant / replica meta files and appends to
          the histogram meta file (names taken from const.FS_META_*);
        * logs the input parameters and the final statistics through log().

    Returns None.
    """
    import shutil  # local import: only needed to wipe the previous tree

    global fsPath
    fsPath = filesystemPath
    if fsPath[-1] != '/':
        fsPath = fsPath + '/'  # file system path must end with '/'
    start_time = time.time()

    # Recreate the file system root. Be careful: this wipes the whole tree.
    # shutil.rmtree is used instead of a shell 'rm -rf' so that paths with
    # spaces or shell metacharacters are handled literally.
    shutil.rmtree(fsPath, ignore_errors=True)
    if not os.path.exists(fsPath):
        os.makedirs(fsPath)

    # create folders; their number is drawn between total/100 and total/5
    # todo: need better things
    nDir = random.randint(total // 100, total // 5) + 1  # +1 to avoid ending up with 0 folders
    for i in range(nDir):
        dirPath = fsPath + str(i)
        if not os.path.exists(dirPath):
            os.makedirs(dirPath)

    hFile = fsPath + const.FS_META_HISTOGRAM

    # add files to the file system
    # todo: recheck this: x*(1+repProb.e+mutProb.e) = 1 with x = probability
    #       of unique files in the file system
    uniProb = 1 - repProb - mutProb
    nUnique = int(round(uniProb * total))
    totalUnique = 0
    totalMutant = 0
    totalRep = 0
    totalSize = 0
    count = dict()   # book-keeping: number of files generated by each distribution
    minSize = None   # None until the first unique file has been created
    maxSize = 0

    fu = fm = fr = fh = None
    try:
        # open log files
        fu = open(fsPath + const.FS_META_UNIQUE, 'w')
        fm = open(fsPath + const.FS_META_MUTANT, 'w')
        fr = open(fsPath + const.FS_META_REPLIC, 'w')

        # print out input params
        log([hFile, logFile], 'Creating_FS', True)
        majorDistPercent = 0
        majorDist = -1
        for k, v in sizeDist.items():
            if v > majorDistPercent:
                majorDistPercent = v
                majorDist = k.name

        parts = []
        parts.append('\tmajorDist:' + str(majorDist) + '\tdistName:' + str(const.DIST_NAME[majorDist]) + '\n')
        parts.append('\tdupRate:' + str(repProb+mutProb) + '\n')
        parts.append('\tfsPath:' + str(filesystemPath) + '\n')
        parts.append('\ttotal:' + str(total) + '\n')
        parts.append('\tsizeDist:\n')
        for k, v in sizeDist.items():
            parts.append('\t\t%:' + str(v) + '\tdist:' + const.DIST_NAME[int(k.name)] +
                         '\tparams:' + str(k.params) +
                         '\tmean_size:' + humanReadable(expect(k)) + '\n')
        parts.append('\trepProb:' + str(repProb) + '\n')
        parts.append('\tnRepDist:' + str(nRepDist.name) + '\tparams:' + str(nRepDist.params) + '\n')
        parts.append('\tmutProb:' + str(mutProb) + '\n')
        parts.append('\tnMutDist:' + str(nMutDist.name) + '\tparams:' + str(nMutDist.params) + '\n')
        parts.append('\tmutantLevel:' + str(mutantLevel) + '\n')
        # NOTE(review): humanReadable also receives quota when it is None -
        # confirm that it accepts None.
        parts.append('\tquota:' + humanReadable(quota) + '\n')
        parts.append('\tlogFile:' + str(logFile) + '\n')
        inputs = ''.join(parts)
        log([hFile, logFile], inputs)
        log(hFile, '\n\t------')

        # transform dist
        sizeDist = dict(sizeDist)  # work on a copy to protect the input param
        transformDist(sizeDist)

        # start creating files
        fh = open(hFile, 'a')
        for i in range(nUnique):
            # generate random real number -> get the distribution
            selectedDist = selectDist(sizeDist, random.random())

            # create 1 unique file; redraw until the size is positive
            size = 0
            while size <= 0:
                size = int(round(drawSize(selectedDist)))  # draw file size from distribution
            if minSize is None:
                minSize = size
            else:
                minSize = min(minSize, size)
            maxSize = max(maxSize, size)

            fcreated = []
            # create 1 file, put it in a random dir, get the file name(s) back
            fcreated.extend(createFiles(1, size, getRandomDir(nDir), i, fu))
            totalUnique += 1

            # create mutant file(s)
            nMutant = drawNumber(mutProb/uniProb, nMutDist)  # number of mutants, may be 0
            if nMutant > 0:
                # create mutant(s), put them in random dirs, get the list of mutants back
                fcreated.extend(createMutants(fcreated[0], i, nMutant, mutantLevel, totalMutant, nDir, fm))
                totalMutant += nMutant

            # create rep file(s)
            nRep = drawNumber(repProb/uniProb, nRepDist)  # number of replicas
            if nRep > 0:
                # create reps (duplicates of unique+mutant), put them in random dirs
                fcreated.extend(createRep(fcreated, nRep, totalRep, nDir, fr))
                totalRep += nRep

            # update statistics info & logs
            count[selectedDist] = count.get(selectedDist, 0) + 1 + nMutant + nRep
            for f in fcreated:
                fh.write('x:\t' + f + '\t' + str(size) + '\n')
            totalSize += size * (1+nRep+nMutant)

            if quota is not None and totalSize > quota:
                logStr = '\tOver quota after file ' + str(totalUnique + totalMutant + totalRep) + 'th. quota = ' + humanReadable(quota)
                log(logFile, logStr)
                fh.write(logStr + '\n')
                break

            # check point
            if (i+1) % 100 == 0:
                print('\tcreated:' + str(i+1) + '/' + str(nUnique) + '\t' + currentTime())

        fh.write('\t------\n\n')
    finally:
        # close log files, also when one of the helpers raised
        for handle in (fu, fm, fr, fh):
            if handle is not None:
                handle.close()

    # print statistics info
    totalFile = float(totalUnique + totalMutant + totalRep)

    def share(n):
        # fraction of all created files (2 decimals); 0.0 when nothing was created
        if totalFile == 0:
            return 0.0
        return round(n/totalFile, 2)

    if totalFile == 0:
        # nothing was generated: report zeros instead of dividing by zero
        avgSize = 0.0
        reportedMin = 0
    else:
        avgSize = totalSize/totalFile
        reportedMin = minSize

    for k, v in sorted(count.items()):
        log([hFile, logFile], '\tdist:' + const.DIST_NAME[int(k.name)] + '\tparams:' + str(k.params) + '\tfiles:' + str(v) + '\t%:' + str(share(v)))
    log([hFile, logFile], '\tfolders:' + str(nDir))
    log([hFile, logFile], '\ttotal_files:' + str(int(totalFile)))
    log([hFile, logFile], '\tunique:' + str(totalUnique) + '\t%:' + str(share(totalUnique)))
    log([hFile, logFile], '\tmutant:' + str(totalMutant) + '\t%:' + str(share(totalMutant)))
    log([hFile, logFile], '\trep:' + str(totalRep) + '\t%:' + str(share(totalRep)))
    log([hFile, logFile], '\ttotal_size:' + str(const.humanReadable(totalSize)))
    log([hFile, logFile], '\tmin_size:' + str(const.humanReadable(reportedMin)))
    log([hFile, logFile], '\tavg_size:' + str(const.humanReadable(avgSize)))
    log([hFile, logFile], '\tmax_size:' + str(const.humanReadable(maxSize)))
    log([hFile, logFile], '\tcreate_time:' + str(round(time.time() - start_time, 3)) + '\tseconds', True)