def main():
    """Dump '<real-time> <detected-time>' page-timing pairs for SVM-detected
    pages to m_pagetime.txt.

    Command-line args: harfolder (HAR input folder), predictfile (SVM
    prediction file).
    """
    parser = argparse.ArgumentParser(
        description='Extracting features as the input of LIBSVM from log. '
                    'Output = logfile.instance')
    parser.add_argument('harfolder', type=str, help='')
    parser.add_argument('predictfile', type=str, help='')
    args = parser.parse_args()
    harfolder = args.harfolder
    predicted_file = args.predictfile

    # Ground truth parsed from the HAR files.
    (all_real_pages, all_objects) = har.parse_pages_har(harfolder)
    valid_urls = [i.root.url for i in all_real_pages]

    # Reset tree pointers: the parse leaves stale parent/child links on the
    # nodes, which corrupts the page re-cut below.
    for node in all_objects:
        node.bpointer = None
        node.fpointer = []

    # Pages detected with SVM.
    (recut_pos_pages, recut_tp_pages) = get_svm_pages(all_objects, valid_urls,
                                                      predicted_file)
    timetuple = pagetime(all_real_pages, recut_tp_pages)

    # Keep only pairs where both the real and the detected timing are positive.
    pagetimings = ['{0} {1}'.format(i[0], i[1])
                   for i in timetuple if i[0] > 0 and i[1] > 0]
    # 'with' guarantees the file is closed even if write() raises.
    with open('m_pagetime.txt', 'wb') as ofile:
        ofile.write('\n'.join(pagetimings))
def main():
    """Write positive '<real-time> <detected-time>' page-timing pairs for
    SVM-detected pages to m_pagetime.txt.

    Command-line args: harfolder (HAR input folder), predictfile (SVM
    prediction file).
    """
    parser = argparse.ArgumentParser(
        description='Extracting features as the input of LIBSVM from log. '
                    'Output = logfile.instance')
    parser.add_argument('harfolder', type=str, help='')
    parser.add_argument('predictfile', type=str, help='')
    args = parser.parse_args()
    harfolder = args.harfolder
    predicted_file = args.predictfile

    # Ground truth parsed from the HAR files.
    (all_real_pages, all_objects) = har.parse_pages_har(harfolder)
    valid_urls = [i.root.url for i in all_real_pages]

    # Reset tree pointers: the parse leaves stale parent/child links on the
    # nodes, which corrupts the page re-cut below.
    for node in all_objects:
        node.bpointer = None
        node.fpointer = []

    # Pages detected with SVM.
    (recut_pos_pages, recut_tp_pages) = get_svm_pages(all_objects, valid_urls,
                                                      predicted_file)
    timetuple = pagetime(all_real_pages, recut_tp_pages)

    # Keep only pairs where both the real and the detected timing are positive.
    pagetimings = ['{0} {1}'.format(pair[0], pair[1])
                   for pair in timetuple if pair[0] > 0 and pair[1] > 0]
    # Context manager closes the file even on a write error.
    with open('m_pagetime.txt', 'wb') as ofile:
        ofile.write('\n'.join(pagetimings))
def main():
    """Extract the URL and start time of every real page in a HAR folder and
    dump them, tab-separated one per line, to the output file.

    Command-line args: harfolder (HAR input folder), output (dump file path).
    """
    parser = argparse.ArgumentParser(
        description='This program extracts real page urls of HAR files '
                    'as groundtruth.')
    parser.add_argument('harfolder', type=str,
                        help='File folder containing HAR file(s). All the HAR '
                             'files under this folder will be processed.')
    parser.add_argument('output', type=str, help='Output data.')
    args = parser.parse_args()
    harfolder = args.harfolder
    dumpfile = args.output

    (all_pages, all_objs) = HAR.parse_pages_har(harfolder)

    # Mode 'wb' truncates an existing file, so no explicit os.remove is
    # needed; 'with' also makes the final flush/close automatic.
    count = 0  # number of pages actually written
    with open(dumpfile, 'wb') as ofile:
        for p in all_pages:
            if p.root:
                count += 1
                ofile.write("{0}\t{1}\n".format(p.root.url, p.root.start_time))
    print('write {0} real pages to: {1}'.format(count, dumpfile))
def main(): parser = argparse.ArgumentParser( description= 'Extracting features as the input of LIBSVM from log. Output = logfile.instance' ) parser.add_argument('harfolder', type=str, help='') args = parser.parse_args() harfolder = args.harfolder # Ground truth (all_real_pages, all_objects) = HAR.parse_pages_har(harfolder) valid_pages = [i.root.identifier for i in all_real_pages] for node in all_objects: node.bpointer = None node.fpointer = [] # detected pages with SVM (pos_pages, tp_pages) = get_ss_pages(all_objects, valid_pages) # objects status print 'real pages:', len(all_real_pages) print 'detected pages:', len(pos_pages) (classified_right, classified_wrong, missed) = \ check_objects(all_real_pages, pos_pages) print 'right {0} wrong {1} missed {2}'.format(len(classified_right), len(classified_wrong), len(missed)) def which_type(obj): subtype_re = { r'.*(jpeg|jpg|gif|png|bmp|ppm|pgm|pbm|pnm|tiff|exif|cgm|svg).*': 'image', r'.*(flash|flv).*': 'flash', r'.*(css).*': 'css', r'.*(javascript|js).*': 'js', r'.*(html|htm).*': 'html', } if obj.type != None: for regex in subtype_re.keys(): if re.match(re.compile(regex, re.I), obj.type): return subtype_re[regex] else: continue return 'others' def stat(objects): html = 0 js = 0 css = 0 flash = 0 image = 0 others = 0 for obj in objects: objtype = which_type(obj) if objtype == 'html': html += 1 elif objtype == 'js': js += 1 elif objtype == 'css': css += 1 elif objtype == 'flash': flash += 1 elif objtype == 'image': image += 1 elif objtype == 'others': others += 1 print 'html {0} js {1} css {2} flash {3} image {4} others{5}'.\ format(html, js, css, flash,image,others) stat(classified_right) stat(classified_wrong) stat(missed)
def main(): parser = argparse.ArgumentParser(description='Extracting features as the input of LIBSVM from log. Output = logfile.instance') parser.add_argument('harfolder', type=str, help= '') args = parser.parse_args() harfolder = args.harfolder # Ground truth (all_real_pages, all_objects) = HAR.parse_pages_har(harfolder) valid_pages = [i.root.identifier for i in all_real_pages] for node in all_objects: node.bpointer = None node.fpointer = [] # detected pages with SVM (pos_pages, tp_pages) = get_timetype_pages(all_objects, valid_pages) # objects status print 'real pages:',len(all_real_pages) print 'detected pages:', len(pos_pages) (classified_right, classified_wrong, missed) = \ check_objects(all_real_pages, pos_pages) print 'right {0} wrong {1} missed {2}'.format(len(classified_right), len(classified_wrong), len(missed)) def which_type(obj): subtype_re = { r'.*(jpeg|jpg|gif|png|bmp|ppm|pgm|pbm|pnm|tiff|exif|cgm|svg).*': 'image', r'.*(flash|flv).*': 'flash', r'.*(css).*': 'css', r'.*(javascript|js).*': 'js', r'.*(html|htm).*': 'html', } if obj.type != None: for regex in subtype_re.keys(): if re.match(re.compile(regex, re.I), obj.type): return subtype_re[regex] else: continue return 'others' def stat(objects): html = 0 js = 0 css = 0 flash = 0 image = 0 others = 0 for obj in objects: objtype = which_type(obj) if objtype == 'html': html += 1 elif objtype == 'js': js += 1 elif objtype =='css': css += 1 elif objtype == 'flash': flash += 1 elif objtype == 'image': image += 1 elif objtype == 'others': others += 1 print 'html {0} js {1} css {2} flash {3} image {4} others{5}'.\ format(html, js, css, flash,image,others) stat(classified_right) stat(classified_wrong) stat(missed)
def main(): parser = argparse.ArgumentParser(description='Extracting features as the input of LIBSVM from log. Output = logfile.instance') parser.add_argument('harfolder', type=str, help= '') parser.add_argument('predictfile', type=str, help= '') args = parser.parse_args() harfolder = args.harfolder predicted_file = args.predictfile # Ground truth (all_real_pages, all_objects) = har.parse_pages_har(harfolder) valid_urls = [i.root.url for i in all_real_pages] # Reset nodes. One whole day wasted...Shit!@@ for node in all_objects: node.bpointer = None node.fpointer = [] # Ground truth pagetime dumpfile = 'pagetime_gt.txt' ofile = open(dumpfile, 'wb') for page in all_real_pages: ofile.write(str(page.total_seconds())+'\n') ofile.close() # detected pages with SVM (recut_pos_pages, recut_tp_pages) = get_svm_pages(all_objects, valid_urls, predicted_file) # page timings # pagetimings = [str(i.total_seconds()) for i in recut_pos_pages if i.total_seconds() > 0] # ofile = open('pagetime_svm_pos.txt', 'wb') # ofile.write('\n'.join(pagetimings)) # ofile.close() # pagetimings = [str(i.total_seconds()) for i in recut_tp_pages if i.total_seconds() > 0] # ofile = open('pagetime_svm_tp.txt', 'wb') # ofile.write('\n'.join(pagetimings)) # ofile.close() # objects status (classified_right, classified_wrong, missed) = \ check_objects(all_real_pages, recut_pos_pages) print 'right {0} wrong {1} missed {2}'.format(len(classified_right), len(classified_wrong), len(missed)) def which_type(obj): subtype_re = { r'.*(jpeg|jpg|gif|png|bmp|ppm|pgm|pbm|pnm|tiff|exif|cgm|svg).*': 'image', r'.*(flash|flv).*': 'flash', r'.*(css).*': 'css', r'.*(javascript|js).*': 'js', r'.*(html|htm).*': 'html', } if obj.type != None: for regex in subtype_re.keys(): if re.match(re.compile(regex, re.I), obj.type): return subtype_re[regex] else: continue return 'others' def stat(objects): html = 0 js = 0 css = 0 flash = 0 image = 0 others = 0 for obj in objects: objtype = which_type(obj) if objtype == 'html': html += 1 
elif objtype == 'js': js += 1 elif objtype =='css': css += 1 elif objtype == 'flash': flash += 1 elif objtype == 'image': image += 1 elif objtype == 'others': others += 1 print 'html {0} js {1} css {2} flash {3} image {4} others{5}'.\ format(html, js, css, flash,image,others) stat(classified_right) stat(classified_wrong) stat(missed)
def main(): parser = argparse.ArgumentParser( description= 'Extracting features as the input of LIBSVM from log. Output = logfile.instance' ) parser.add_argument('harfolder', type=str, help='') parser.add_argument('predictfile', type=str, help='') args = parser.parse_args() harfolder = args.harfolder predicted_file = args.predictfile # Ground truth (all_real_pages, all_objects) = har.parse_pages_har(harfolder) valid_urls = [i.root.url for i in all_real_pages] # Reset nodes. One whole day wasted...Shit!@@ for node in all_objects: node.bpointer = None node.fpointer = [] # Ground truth pagetime dumpfile = 'pagetime_gt.txt' ofile = open(dumpfile, 'wb') for page in all_real_pages: ofile.write(str(page.total_seconds()) + '\n') ofile.close() # detected pages with SVM (recut_pos_pages, recut_tp_pages) = get_svm_pages(all_objects, valid_urls, predicted_file) # page timings # pagetimings = [str(i.total_seconds()) for i in recut_pos_pages if i.total_seconds() > 0] # ofile = open('pagetime_svm_pos.txt', 'wb') # ofile.write('\n'.join(pagetimings)) # ofile.close() # pagetimings = [str(i.total_seconds()) for i in recut_tp_pages if i.total_seconds() > 0] # ofile = open('pagetime_svm_tp.txt', 'wb') # ofile.write('\n'.join(pagetimings)) # ofile.close() # objects status (classified_right, classified_wrong, missed) = \ check_objects(all_real_pages, recut_pos_pages) print 'right {0} wrong {1} missed {2}'.format(len(classified_right), len(classified_wrong), len(missed)) def which_type(obj): subtype_re = { r'.*(jpeg|jpg|gif|png|bmp|ppm|pgm|pbm|pnm|tiff|exif|cgm|svg).*': 'image', r'.*(flash|flv).*': 'flash', r'.*(css).*': 'css', r'.*(javascript|js).*': 'js', r'.*(html|htm).*': 'html', } if obj.type != None: for regex in subtype_re.keys(): if re.match(re.compile(regex, re.I), obj.type): return subtype_re[regex] else: continue return 'others' def stat(objects): html = 0 js = 0 css = 0 flash = 0 image = 0 others = 0 for obj in objects: objtype = which_type(obj) if objtype == 'html': html += 
1 elif objtype == 'js': js += 1 elif objtype == 'css': css += 1 elif objtype == 'flash': flash += 1 elif objtype == 'image': image += 1 elif objtype == 'others': others += 1 print 'html {0} js {1} css {2} flash {3} image {4} others{5}'.\ format(html, js, css, flash,image,others) stat(classified_right) stat(classified_wrong) stat(missed)