def compare_features(self):
    """Walk the feature files that reports b1 and b2 have in common and
    write, for each, the features present in both (when self.both is set)
    followed by the features that appear in only one of the two reports.

    Output goes to self.out; only files listed in self.only_features are
    examined.
    """
    out = self.out
    for feature_file in self.b1.feature_files():
        # Honor the whitelist, and skip files that b2 does not also have.
        if feature_file not in self.only_features:
            continue
        if feature_file not in self.b2.feature_files():
            continue
        print("Compare features", feature_file)
        if self.both:
            (a, b) = self.getab()
            # 35 is ord('#'): skip comment lines when collecting offsets.
            a_offsets = {bulk_extractor_reader.parse_feature_line(ln)[0]
                         for ln in a.open(feature_file) if ln[0] != 35}
            b_offsets = {bulk_extractor_reader.parse_feature_line(ln)[0]
                         for ln in b.open(feature_file) if ln[0] != 35}
            common = a_offsets & b_offsets
            for ln in a.open(feature_file):
                parsed = bulk_extractor_reader.parse_feature_line(ln)
                if parsed and parsed[0] in common:
                    print("{} {} IN BOTH".format(parsed[0].decode('utf-8'), parsed[1].decode('utf-8')), file=out)
        # differences: run both directions (p selects which report is "a")
        for p in [1, 2]:
            (a, b) = self.getab(p)
            a_features = {}
            for ln in a.open(feature_file):
                parsed = bulk_extractor_reader.parse_feature_line(ln)
                if parsed:
                    a_features[parsed[0]] = parsed[1]
            for ln in b.open(feature_file):
                parsed = bulk_extractor_reader.parse_feature_line(ln)
                if parsed and parsed[0] not in a_features:
                    print("{} {} is only in {}".format(parsed[0].decode('utf-8'), parsed[1].decode('utf-8'), b.name), file=out)
def datacheck_checkreport(outdir):
    """Reports on whether the output in outdir matches the datacheck report"""
    print("opening ", outdir)
    b = bulk_extractor_reader.BulkReport(outdir)
    print("Feature files:", list(b.feature_files()))
    print("Histogram files:", list(b.histogram_files()))
    # Index every feature found in the report by its forensic path.
    found_features = {}
    for fn in b.feature_files():
        print("Reading feature file {}".format(fn))
        for (pos, feature, context) in b.read_features(fn):
            found_features[pos] = feature
    print("Now reading features from data_check.txt")
    report_mismatches = False
    not_found = {}
    found_count = 0
    for line in open("data_check.txt", "rb"):
        parsed = bulk_extractor_reader.parse_feature_line(line)
        if not parsed:
            continue
        (pos, feature, context) = parsed
        if pos not in found_features:
            not_found[pos] = feature
            continue
        found_count += 1
        # Mismatch reporting is off by default; <CACHED> entries never count.
        if report_mismatches and found_features[pos] != feature:
            if found_features[pos] != b'<CACHED>' and feature != b'<CACHED>':
                print("   {} != {}".format(feature, found_features[pos]))
    for pos in sorted(not_found):
        print("{} not found {}".format(pos, not_found[pos]))
    print("Total features found: {}".format(found_count))
    print("Total features not found: {}".format(len(not_found)))
def datacheckreport(outdir):
    """Report on whether the output in outdir matches data_check.txt.

    NOTE(review): this function was a line-for-line duplicate of
    datacheck_checkreport(); delegate to it so the logic lives in one
    place. Behavior (all printed output) is identical.
    """
    return datacheck_checkreport(outdir)
def process(out, dname1, dname2):
    """Compare two bulk_extractor reports (PRE in dname1, POST in dname2)
    and write a typeset difference report to out.

    Reports: files present in only one directory, per-histogram value
    deltas, and (when options.features is set) features present in only
    one report. Output format is HTML when options.html is set, else text.
    """
    mode = 'text'
    if options.html:
        mode = 'html'
    b1 = bulk_extractor_reader.BulkReport(dname1)
    b2 = bulk_extractor_reader.BulkReport(dname2)
    t = ttable.ttable()
    t.append_data(['bulk_diff.py Version:', bulk_diff_version])
    t.append_data(['PRE Image:', b1.image_filename()])
    t.append_data(['POST Image:', b2.image_filename()])
    out.write(t.typeset(mode=mode))

    # Report files that exist in only one of the two reports.
    for i in [1, 2]:
        if i == 1:
            a = b1; b = b2
        else:
            b = b1; a = b2
        r = a.files.difference(b.files)
        if r:
            print("Files only in {}:".format(a.name))
            for f in r:
                if ".txt" in f:
                    print(" %s (%d lines)" % (f, a.count_lines(f)))
                else:
                    print(" %s" % (f))

    # Report interesting differences based on the histograms.
    # Output Example:
    # # in PRE # in POST ∆ Feature
    # 10 20 10 [email protected]
    # 8 17 9 [email protected]
    # 11 16 5 [email protected]
    b1_histograms = set(b1.histogram_files())
    b2_histograms = set(b2.histogram_files())
    common_histograms = b1_histograms.intersection(b2_histograms)
    if options.html:
        out.write("<ul>\n")
        # BUG FIX: the original iterated over the undefined name
        # 'histogram_files', raising NameError whenever options.html was
        # set; the table of contents should list the common histograms.
        for histogram_file in sorted(common_histograms):
            out.write("<li><a href='#%s'>%s</a></li>\n" % (histogram_file, histogram_file))
        out.write("</ul>\n<hr/>\n")
    for histogram_file in sorted(common_histograms):
        diffcount = 0
        if options.html:
            out.write('<h2><a name="%s">%s</a></h2>\n' % (histogram_file, histogram_file))
        else:
            out.write('\n')
        t = ttable.ttable()
        t.set_col_alignment(0, t.RIGHT)
        t.set_col_alignment(1, t.RIGHT)
        t.set_col_alignment(2, t.RIGHT)
        t.set_col_alignment(3, t.LEFT)
        t.set_title(histogram_file)
        t.append_head(['# in PRE', '# in POST', '∆', 'Value'])
        b1.hist = b1.read_histogram(histogram_file)
        b2.hist = b2.read_histogram(histogram_file)
        b1.keys = set(b1.hist.keys())
        b2.keys = set(b2.hist.keys())
        # Create the output, then we will sort on col 1, 2 and 4
        data = []
        for feature in b1.keys.union(b2.keys):
            v1 = b1.hist.get(feature, 0)
            v2 = b2.hist.get(feature, 0)
            if v1 != v2:
                diffcount += 1
            if v2 > v1 or (v2 == v1 and options.same) or (v2 < v1 and options.smaller):
                data.append((v1, v2, v2 - v1, feature.decode('utf-8')))

        # Sort according the diff first, then v2 amount, then v1 amount,
        # then alphabetically on value
        def mysortkey(a):
            return (-a[2], a[3], a[1], a[0])

        if data:
            for row in sorted(data, key=mysortkey):
                t.append_data(row)
            out.write(t.typeset(mode=mode))
        if diffcount == 0:
            # Both the html and text branches printed the same message, so
            # the options.html test here was collapsed.
            out.write("{}: No differences\n".format(histogram_file))

    if options.features:
        for feature_file in b1.feature_files():
            if feature_file not in b2.feature_files():
                continue
            print("Compare features", feature_file)
            for p in [1, 2]:
                if p == 1:
                    a = b1; b = b2
                else:
                    # BUG FIX: the original did 'a = b2; b = a', which made
                    # both names refer to b2, so the second pass compared b2
                    # against itself and features only in b1 were never shown.
                    a = b2; b = b1
                a_features = {}
                for line in a.open(feature_file):
                    r = bulk_extractor_reader.parse_feature_line(line)
                    if not r:
                        continue
                    a_features[r[0]] = r[1]
                for line in b.open(feature_file):
                    r = bulk_extractor_reader.parse_feature_line(line)
                    if not r:
                        continue
                    if r[0] not in a_features:
                        # BUG FIX: a feature found in b's file but absent from
                        # a's is only in b's report — the original printed
                        # a.name (compare_features correctly prints b.name).
                        print("{} {} is only in {}".format(r[0], r[1], b.name))
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))
    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Image filename: {}".format(b.image_filename()))

    # Print which scanners were run and how long they took
    analyze_reportxml(b.xmldoc)

    hfns = list(b.histogram_files())  # histogram files
    print("")
    print("Histogram Files: {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn, 'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]
        # Falls through to None when the file holds only comments.

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        # FIX: replaces the redundant, non-idiomatic check
        # 'type(firstline)==bytes and type(firstline)!=str' — isinstance is
        # the correct form and also safely skips a None firstline.
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print(" {:>25} entries: {:>10,} (top: {})".format(fn, len(h), firstline))

    fnpart = ".".join(b.image_filename().split('/')[-1].split('.')[:-1])
    if b.feature_files():
        ffns = sorted(list(b.feature_files()))
        features = {}
        print("")
        print("Feature Files: {}".format(len(ffns)))
        for fn in ffns:  # feature files
            # Count the non-comment lines of each feature file.
            lines = 0
            for line in b.open(fn, 'r'):
                if not bulk_extractor_reader.is_comment_line(line):
                    lines += 1
            features[fn] = lines
            print(" {:>25} features: {:>12,} {}".format(fn, lines, analyze_warning(fnpart, fn, lines)))

        # If there is a SQLite database, analyze that too!
        if args.featurefile and args.featuresql:
            import sqlite3
            conn = sqlite3.connect(os.path.join(outdir, "report.sqlite"))
            if conn:
                c = conn.cursor()
                c.execute("PRAGMA cache_size = 200000")
                print("Comparing SQLite3 database to feature files:")
                for fn in ffns:
                    try:
                        table = "f_" + fn.lower().replace(".txt", "")
                        cmd = "select count(*) from " + table
                        print(cmd)
                        c.execute(cmd)
                        ct = c.fetchone()[0]
                        print("{}: {} {}".format(fn, features[fn], ct))
                        # Now check them all to make sure that they all match;
                        # stop after args.featuretest features per file.
                        count = 0
                        for line in b.open(fn, 'r'):
                            ary = bulk_extractor_reader.parse_feature_line(line)
                            if not ary:
                                continue
                            (path, feature) = ary[0:2]
                            path = path.decode('utf-8')
                            feature = feature.decode('utf-8')
                            c.execute("select count(*) from " + table + " where path=? and feature_eutf8=?", (path, feature))
                            ct = c.fetchone()[0]
                            # FIX: dropped the dead 'if ct==1: pass' branch.
                            if ct == 0:
                                print("feature {} {} not in table {} ({})".format(path, feature, table, ct))
                            count += 1
                            if count > args.featuretest:
                                break
                    except sqlite3.OperationalError as e:
                        print(e)
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))
    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Image filename: {}".format(b.image_filename()))

    # Print which scanners were run and how long they took
    analyze_reportxml(b.xmldoc)

    hfns = list(b.histogram_files())  # histogram files
    print("")
    print("Histogram Files: {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn, 'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]
        # Falls through to None when the file holds only comments.

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        # FIX: replaces the redundant, non-idiomatic check
        # 'type(firstline)==bytes and type(firstline)!=str' — isinstance is
        # the correct form and also safely skips a None firstline.
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print(" {:>25} entries: {:>10,} (top: {})".format(fn, len(h), firstline))

    fnpart = ".".join(b.image_filename().split('/')[-1].split('.')[:-1])
    ffns = sorted(list(b.feature_files()))
    if ffns:
        features = {}
        print("")
        print("Feature Files: {}".format(len(ffns)))
        for fn in ffns:  # feature files
            # Count the non-comment lines of each feature file.
            lines = 0
            for line in b.open(fn, 'r'):
                if not bulk_extractor_reader.is_comment_line(line):
                    lines += 1
            features[fn] = lines
            print(" {:>25} features: {:>12,} {}".format(fn, lines, analyze_warning(fnpart, fn, lines)))

        # If there is a SQLite database, analyze that too!
        if args.featurefile and args.featuresql:
            import sqlite3
            conn = sqlite3.connect(os.path.join(outdir, "report.sqlite"))
            if conn:
                c = conn.cursor()
                c.execute("PRAGMA cache_size = 200000")
                print("Comparing SQLite3 database to feature files:")
                for fn in ffns:
                    try:
                        table = "f_" + fn.lower().replace(".txt", "")
                        cmd = "select count(*) from " + table
                        print(cmd)
                        c.execute(cmd)
                        ct = c.fetchone()[0]
                        print("{}: {} {}".format(fn, features[fn], ct))
                        # Now check them all to make sure that they all match;
                        # stop after args.featuretest features per file.
                        count = 0
                        for line in b.open(fn, 'r'):
                            ary = bulk_extractor_reader.parse_feature_line(line)
                            if not ary:
                                continue
                            (path, feature) = ary[0:2]
                            path = path.decode('utf-8')
                            feature = feature.decode('utf-8')
                            c.execute("select count(*) from " + table + " where path=? and feature_eutf8=?", (path, feature))
                            ct = c.fetchone()[0]
                            # FIX: dropped the dead 'if ct==1: pass' branch.
                            if ct == 0:
                                print("feature {} {} not in table {} ({})".format(path, feature, table, ct))
                            count += 1
                            if count > args.featuretest:
                                break
                    except sqlite3.OperationalError as e:
                        print(e)