Exemplo n.º 1
0
def main():
    """Read the DFXML file named on the command line, then print the
    accumulated timeline, one tab-separated record per line.

    Exits with status 1 when no filename argument is supplied.
    """
    if len(sys.argv) < 2:
        print("Usage: {} <filename.xml>".format(sys.argv[0]))
        exit(1)
    # FIX: close the XML file when parsing finishes instead of leaking the
    # handle. process() is the module-level callback expected to append
    # records to the global `timeline` list.
    with open(sys.argv[1], "rb") as xmlfile:
        dfxml.read_dfxml(xmlfile=xmlfile, callback=process)
    timeline.sort()
    for record in timeline:
        print("\t".join(map(str, record)))
Exemplo n.º 2
0
def main():
    """Emit SQL INSERT scripts for every allocated regular file in a
    fiwalk DFXML report.

    Writes two script files in the current directory:
      * filemetadata.sql -- one row per file for diskprint.filemetadata
      * md5.sql          -- one (sha1, md5) row per file for diskprint.MD5

    Relies on the module-level ``args`` namespace and ``differ_library``
    for database configuration (both defined outside this function).
    """
    filemetadata_out = open("filemetadata.sql", "w")
    md5_out = open("md5.sql", "w")

    (conn, cursor) = differ_library.db_conn_from_config_path(args.config)

    #Get slice hash
    cursor.execute("SELECT slicehash FROM diskprint.storage WHERE location = %s", (args.slice_path,))
    inrows = [row for row in cursor]
    # Exactly one storage row must match the tarball path; anything else is fatal.
    if len(inrows) != 1:
        logging.error("Could not find diskprint from tarball path: %r." % args.slice_path)
        sys.exit(1)
    slicehash = inrows[0]["slicehash"]

    def process_fi(fi):
        """
        Produce SQL records for every allocated file.
        (This is an inline function so the value of 'slicehash' is in scope.)
        """
        #Only allocated, regular files
        if not fi.allocated():
            return
        if fi.name_type() != "r":
            return

        #Build SQL templates
        # NOTE(review): values are %-interpolated straight into SQL text, so a
        # filename containing a single quote would yield a broken/unsafe
        # script -- confirm inputs are trusted or switch to escaping.
        md5_insert_template = "insert into diskprint.MD5 values ('%(keyhash)s','%(keyhash_md5)s');\n"
        filemetadata_insert_template = "insert into diskprint.filemetadata (keyhash, slicehash, path, filename, extension, bytes, mtime, ctime) values ('%(keyhash)s','%(slicehash)s','%(path)s','%(filename)s','%(extension)s',%(bytes)d,'%(mtime)s','%(ctime)s');\n"

        #Build SQL values as substitution dictionary
        d = dict()
        d["keyhash"] = fi.sha1()
        d["keyhash_md5"] = fi.md5()
        d["slicehash"] = slicehash
        d["path"] = fi.filename()
        d["filename"] = os.path.basename(fi.filename())
        d["extension"] = os.path.splitext(fi.filename())[1]
        d["bytes"] = fi.filesize()
        d["mtime"] = fi.mtime()
        d["ctime"] = fi.crtime() #TODO What does this table actually mean by ctime?  Change, or create?

        #Output
        filemetadata_out.write(filemetadata_insert_template % d)
        md5_out.write(md5_insert_template % d)

    #Begin loop through XML
    dfxml.read_dfxml(xmlfile=open(args.fiwalk_xml, "rb"), callback=process_fi)
Exemplo n.º 3
0
                           max(data)))


if __name__ == "__main__":
    from argparse import ArgumentParser
    from copy import deepcopy

    # Command line: one or more DFXML files plus reporting switches.
    parser = ArgumentParser(
        description='Report information about a DFXML file')
    parser.add_argument('xmlfiles', help='XML files to process', nargs='+')
    parser.add_argument(
        "--files",
        help="Report on file objects that the DFXML file contains",
        action='store_true')
    parser.add_argument(
        "--imagefile",
        help="specifies imagefile to examine; automatically runs fiwalk",
        nargs='+')

    args = parser.parse_args()
    ds = DiskSet()

    if args.files:
        # FIX: 'fn' was referenced here before ever being assigned (the
        # defining loop came later), so --files always raised NameError.
        # Feed every named XML file into the DiskSet, then report.
        for fn in args.xmlfiles:
            dfxml.read_dfxml(xmlfile=open(fn, 'rb'), callback=ds.pass1)
        if ds.uniques() > 0:
            ds.print_dups_report()
        exit(0)

    for fn in args.xmlfiles:
        dfxml_info(fn)
Exemplo n.º 4
0
                      action="store_true")
    parser.add_option("--blocksize",
                      help="specify sector blocksize",
                      # FIX: parse as int -- a CLI-supplied value arrived as
                      # str, breaking the division below.
                      type="int",
                      default=512)
    (options, args) = parser.parse_args()

    if len(args) < 1:
        parser.print_help()
        sys.exit(1)
    fn = args[0]

    print(args)
    print("Processing %s" % fn)
    print("Searching for %s" % ", ".join(args[1:]))

    # When --offset is given, the positional values are byte offsets and must
    # be scaled down to sector numbers.
    divisor = 1
    if options.offset:
        divisor = options.blocksize

    # FIX: floor division -- under Python 3, '/' yields floats, handing
    # non-integral sector numbers to fi.has_sector().
    sectors = set(int(s) // divisor for s in args[1:])

    def process(fi):
        """Print each requested sector that this file object occupies."""
        for s in sectors:
            if fi.has_sector(s):
                print("%d\t%s" % (s, fi.filename()))

    if not fn.endswith(".xml"):
        print("iblkfind requires an XML file")
        exit(1)
    dfxml.read_dfxml(xmlfile=open(fn, "rb"), callback=process)
Exemplo n.º 5
0
if (__name__ == "__main__"):
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.usage =\
    """
    dfxml_xtract.py  [args] dfxml imagefile
    
    Using the metadata from DFXML file, extract each file object
    from a target image file.
    """
    parser.add_argument("dfxml", help="Specify the target DFXML file")
    parser.add_argument("image", help="Specify the target image file")
    args = parser.parse_args()

    # Resolve both inputs to absolute paths BEFORE changing directory below.
    dfxml_path = os.path.abspath(args.dfxml)
    image_path = os.path.abspath(args.image)
    dir_name = os.getcwd() + "/output"
    # FIX: makedirs(..., exist_ok=True) replaces the racy isdir/mkdir pair
    # (TOCTOU) and guarantees we chdir into the same absolute path we
    # just ensured exists.
    os.makedirs(dir_name, exist_ok=True)
    os.chdir(dir_name)

    print("Working Dir :", os.getcwd())
    print("Target DFXML:", dfxml_path)
    print("Target IMAGE:", image_path)

    # extract_file (defined elsewhere) writes each file object into cwd.
    dfxml.read_dfxml(xmlfile=open(dfxml_path, 'rb'), callback=extract_file)
Exemplo n.º 6
0
# Demo program that shows how to calculate the average size of file objects in a DFXML file
#

import math
import sys
import os
import collections

sys.path.append( os.path.join(os.path.dirname(__file__), ".."))
import dfxml

# Per-extension accumulators, keyed by the value of fi.ext().
sums = collections.Counter()            # total bytes per extension
sum_of_squares= collections.Counter()   # sum of squared sizes (for StdDev)
count = collections.Counter()           # number of files per extension

def func(fi):
    """Accumulate per-extension count, byte total, and sum of squares."""
    ext = fi.ext()
    count[ext]  += 1
    sums[ext] += fi.filesize()
    # FIX: accumulate with += -- plain assignment kept only the LAST file's
    # squared size per extension, so the StdDev printed later was wrong.
    sum_of_squares[ext] += fi.filesize() ** 2

# Stream the DFXML file through func(), then print one statistics row
# per observed file extension.
dfxml.read_dfxml(xmlfile=open(sys.argv[1], 'rb'), callback=func)
fmt = "{:8}    {:8} {:8} {:8} {:8}"
print(fmt.format("Ext", "Count", "Total", "Average", "StdDev"))
for ext in sums:
    n = count[ext]
    mean = sums[ext] / n
    variance = sum_of_squares[ext] / n - mean ** 2
    print(fmt.format(ext, n, sums[ext], mean, math.sqrt(variance)))
Exemplo n.º 7
0
        f_tmp_filename.close()

        self.hasdb = {} # to clear the memory
        print ("singletons {}, pairs {}, triples {}, others{}".format(self.singletons, self.pairs, self.triples, self.others))


if __name__=="__main__":
    import argparse,os,sys
    # NOTE(review): os.getenv("DOMEX_HOME") returns None when the variable is
    # unset, which would make these concatenations raise TypeError -- confirm
    # the variable is guaranteed in the deployment environment.
    sys.path.append(os.getenv("DOMEX_HOME") + "/src/lib/") # add the library
    sys.path.append(os.getenv("DOMEX_HOME") + "/src/dfxml/python/")                             # add the library
    import dfxml,subprocess

    parser = argparse.ArgumentParser(description="A program that takes a directory of files, computes sector-based statistics.")
    parser.add_argument("--dir", help="Directory of files that will be analyzed")
    parser.add_argument("--bs", help="Specifies a block size for piecewise hashing", type=int, default=512)
    parser.add_argument("--file", help="Specifies a file that contains output from md5deep", type=str, default=None)
    args = parser.parse_args()

    sc = SectorCorrelator()
    if args.file is None:
        # Run md5deep on the fly and stream its piecewise-hash output.
        # NOTE(review): '-dp '+ str(args.bs) is passed as ONE argv token
        # ("-dp 512") -- verify md5deep accepts the joined option+value form.
        p = subprocess.Popen(['md5deep','-dp '+ str(args.bs), '-r', args.dir],stdout=subprocess.PIPE)
        dfxml.read_dfxml(xmlfile=p.stdout,callback=sc.process)
    else:
        # Use a pre-recorded md5deep output file instead of running md5deep.
        dfxml.read_dfxml(xmlfile=open(args.file,'rb'),callback=sc.process)

    sc.print_report()
    sc.gen_file_stats()
    sc.print_file_report()
    sc.gen_filetype_stats()
    sc.print_filetype_report()
Exemplo n.º 8
0
            list)  #  key is the MD5 code, value is a list of matches
        self.files = 0
        self.sectors = 0

    def process(self, fi):
        """Record every byte run of one <fileobject> under its MD5 digest."""
        self.files += 1
        print(fi.filename())
        for run in fi.byte_runs():
            self.sectors += 1
            # Group (filename, offset-in-file) pairs by the run's MD5 hash.
            self.hashdb[run.hashdigest['md5']].append(
                (fi.filename(), run.file_offset))

    def print_report(self):
        """Print processing totals, then every hash seen more than once."""
        print("Files processed: {}".format(self.files))
        print("Sectors processed: {}".format(self.sectors))
        print("")
        print("The following duplicates were found:")
        print("Hash   Filename           Offset in file")
        for digest, matches in self.hashdb.items():
            if len(matches) <= 1:
                continue
            print("{}  -- {} copies found".format(digest, len(matches)))
            for entry in sorted(matches):
                print("  {}  {:8,}".format(entry[0], entry[1]))
            print("")


# Correlate sector hashes across the DFXML file named on the command line.
sc = SectorCorrelator()
dfxml.read_dfxml(xmlfile=open(sys.argv[1], 'rb'), callback=sc.process)
sc.print_report()
Exemplo n.º 9
0
 def process(self, fname):
     """Remember *fname* and stream its <fileobject>s into self.process_fi."""
     self.fname = fname
     dfxml.read_dfxml(xmlfile=open(fname, "rb"), callback=self.process_fi)
Exemplo n.º 10
0
if __name__=="__main__":
    from argparse import ArgumentParser
    global options

    # Command line: one DFXML file plus reporting switches.
    parser = ArgumentParser()
    parser.add_argument("dfxml",type=str)
    parser.add_argument("--verbose",action="store_true")
    parser.add_argument("--prefix",type=str,help="Only output files with the given prefix")
    parser.add_argument("--distinct",action='store_true',help='Report the distinct files')
    parser.add_argument("--dups",action='store_true',help='Report the files that are dups, and give dup count')
    args = parser.parse_args()

    dobj = dedup()

    # A truncated DFXML file raises ExpatError; tolerate it and report on
    # whatever was parsed before the failure.
    try:
        dfxml.read_dfxml(open(args.dfxml,'rb'),callback=dobj.process)
    except xml.parsers.expat.ExpatError:
        pass

    print("Total files: {:,}  total MD5s processed: {:,}  Unique MD5s: {:,}".format(dobj.files,dobj.md5s,len(dobj.seen)))

    if args.distinct:
        def report_distinct(names):
            # Honor the optional --prefix filter before printing.
            if args.prefix and not names[0].startswith(args.prefix): return
            print("distinct: ",names[0])
        dobj.report(lambda names:len(names)==1,report_distinct)

    if args.dups:
        def report_dups(names):
            for name in names:
                if not args.prefix or name.startswith(args.prefix):
Exemplo n.º 11
0
    parser.usage = '%prog [options] imagefile-or-xmlfile s1 [s2 s3 s3 ...]'
    parser.add_option("--offset", help="values are byte offsets, not sectors",
                      action="store_true")
    # FIX: corrected help-text typo ("blockszie") and parse the value as an
    # int so the division below works when --blocksize is given on the CLI.
    parser.add_option("--blocksize", help="specify sector blocksize",
                      type="int", default=512)
    (options, args) = parser.parse_args()

    if len(args) < 1:
        parser.print_help()
        sys.exit(1)
    fn = args[0]

    print(args)
    print("Processing %s" % fn)
    print("Searching for %s" % ", ".join(args[1:]))

    # When --offset is given, the positional values are byte offsets and must
    # be scaled down to sector numbers.
    divisor = 1
    if options.offset:
        divisor = options.blocksize

    # FIX: floor division -- under Python 3, '/' yields floats, handing
    # non-integral sector numbers to fi.has_sector().
    sectors = set(int(s) // divisor for s in args[1:])

    def process(fi):
        """Print each requested sector that this file object occupies."""
        for s in sectors:
            if fi.has_sector(s):
                print("%d\t%s" % (s, fi.filename()))

    if not fn.endswith(".xml"):
        print("iblkfind requires an XML file")
        exit(1)
    # FIX: open in binary mode -- every other read_dfxml caller uses "rb".
    dfxml.read_dfxml(xmlfile=open(fn, "rb"), callback=process)
Exemplo n.º 12
0
 def read(self,f):
     """Read DFXML from *f* (a path or a binary file object) via self.pass1.

     When *f* is a string it is remembered in self.fname and opened in
     binary mode first.
     """
     if type(f)==str:
         self.fname = f
         f = open(f,'rb')
     dfxml.read_dfxml(xmlfile=f,callback=self.pass1)
Exemplo n.º 13
0

class SectorCorrelator:
    """Correlates per-sector (piecewise) MD5 hashes across file objects."""

    def __init__(self):
        #  key is the MD5 code, value is a list of matches
        self.hashdb = collections.defaultdict(list)
        self.files = 0
        self.sectors = 0

    def process(self, fi):
        """ Process the <fileobject> objects as they are read from the XML file"""
        self.files += 1
        print(fi.filename())
        for run in fi.byte_runs():
            self.sectors += 1
            # Group (filename, offset-in-file) pairs by the run's MD5 hash.
            self.hashdb[run.hashdigest['md5']].append((fi.filename(), run.file_offset))

    def print_report(self):
        """Print processing totals, then every hash seen more than once."""
        print("Files processed: {}".format(self.files))
        print("Sectors processed: {}".format(self.sectors))
        print("")
        print("The following duplicates were found:")
        print("Hash   Filename           Offset in file")
        for digest, matches in self.hashdb.items():
            if len(matches) <= 1:
                continue
            print("{}  -- {} copies found".format(digest, len(matches)))
            for entry in sorted(matches):
                print("  {}  {:8,}".format(entry[0], entry[1]))
            print("")

# Correlate sector hashes across the DFXML file named on the command line.
sc = SectorCorrelator()
dfxml.read_dfxml(xmlfile=open(sys.argv[1],'rb'),callback=sc.process)
sc.print_report()
Exemplo n.º 14
0
        if fi.is_file():
            self.fi_by_md5.setdefault(fi.md5(),[]).append(fi)

    def print_dups_report(self):
        """List duplicated files (grouped by MD5) and total the wasted bytes."""
        print("Duplicates:")
        # Keep only hashes seen more than once; report largest files first.
        dups = filter(lambda item: len(item[1]) > 1, self.fi_by_md5.items())
        dup_bytes = 0
        for md5hash, fis in sorted(dups, key=lambda item: item[1][0].filesize(),
                                   reverse=True):
            for fi in fis:
                print("{:>16,} {:32} {}".format(fi.filesize(), fi.md5(), fi.filename()))
            print()
            # Every copy beyond the first is wasted space.
            dup_bytes += fis[0].filesize() * (len(fis) - 1)
        print("Total duplicate bytes: {:,}".format(dup_bytes))


if __name__=="__main__":
    from argparse import ArgumentParser
    from copy import deepcopy

    # Command line: one or more DFXML files, plus an optional image file
    # that fiwalk would be run against.
    parser = ArgumentParser(description='Report information about a DFXML file')
    parser.add_argument('xmlfiles', help='XML files to process', nargs='+')
    parser.add_argument("--imagefile",
                        help="specifies imagefile to examine; automatically runs fiwalk",
                        nargs='+')

    args = parser.parse_args()
    ds = DiskSet()
    # Feed every named DFXML file into the DiskSet, then report duplicates.
    for fn in args.xmlfiles:
        print("Processing {}".format(fn))
        dfxml.read_dfxml(xmlfile=open(fn, 'rb'), callback=ds.pass1)
    ds.print_dups_report()
Exemplo n.º 15
0
#!/usr/bin/env python3.2

#
# Demo program that shows how to calculate the average size of file objects in a DFXML file
#

import dfxml,math,sys
import collections

# Per-extension accumulators, keyed by the value of fi.ext().
sums = collections.Counter()            # total bytes per extension
sum_of_squares= collections.Counter()   # sum of squared sizes (for StdDev)
count = collections.Counter()           # number of files per extension

def func(fi):
    """Accumulate per-extension count, byte total, and sum of squares."""
    ext = fi.ext()
    count[ext]  += 1
    sums[ext] += fi.filesize()
    # FIX: accumulate with += -- plain assignment kept only the LAST file's
    # squared size per extension, so the StdDev printed later was wrong.
    sum_of_squares[ext] += fi.filesize() ** 2

# Parse the DFXML stream through func(), then emit one statistics row
# per observed file extension.
dfxml.read_dfxml(xmlfile=open(sys.argv[1], 'rb'), callback=func)
fmt = "{:8}    {:8} {:8} {:8} {:8}"
print(fmt.format("Ext", "Count", "Total", "Average", "StdDev"))
for ext in sums:
    total, n = sums[ext], count[ext]
    avg = total / n
    print(fmt.format(ext, n, total, avg,
                     math.sqrt(sum_of_squares[ext] / n - avg ** 2)))
Exemplo n.º 16
0
 def process(self, fname):
     """Remember *fname* and stream its <fileobject>s into self.process_fi."""
     self.fname = fname
     dfxml.read_dfxml(xmlfile=open(fname, 'rb'), callback=self.process_fi)
Exemplo n.º 17
0
 # Derive the output CSV name and announce it.
 hs_name = hs_name + '.csv'
 print("\n>>> Writing HashSet to:", hs_name)
 
 csv_out = open(hs_name, 'w', newline='')
 writer = csv.writer(csv_out)
 # Write out HashSet header
 writer.writerow(["%%%% HASHDEEP-1.0"])
 if args.sha1:
     writer.writerow(["%%%% size","md5","sha1","filename"])
 else:
     writer.writerow(["%%%% size","md5","filename"])
 writer.writerow(["## HashSet created from " + args.dfxml])
 writer.writerow(["## HashSet created using dfxml2hashdeep.py"])
 
 # Set up lists for HashSet values
 # NOTE(review): process_file (defined elsewhere) presumably appends to
 # these parallel lists -- confirm it keeps them the same length, since
 # zip() below silently truncates to the shortest.
 size = []
 md5 = []
 filename = []
 if args.sha1: sha1 = []
 
 # Read DFXML file for metadata values
 dfxml.read_dfxml(xmlfile=open(args.dfxml,'rb'),callback=process_file)
 
 # Write out the lists to the HashSet
 if args.sha1:
     for row in zip(size,md5,sha1,filename):
         writer.writerow(row)
 else:
     for row in zip(size,md5,filename):
         writer.writerow(row)
 csv_out.close()
Exemplo n.º 18
0
if(__name__=="__main__"):
    from argparse import ArgumentParser
    
    parser = ArgumentParser()
    parser.usage =\
    """
    dfxml_xtract.py  [args] dfxml imagefile
    
    Using the metadata from DFXML file, extract each file object
    from a target image file.
    """
    parser.add_argument("dfxml",help="Specify the target DFXML file")
    parser.add_argument("image",help="Specify the target image file")
    args = parser.parse_args()
    
    # Resolve both inputs to absolute paths BEFORE changing directory below.
    dfxml_path = os.path.abspath(args.dfxml)
    image_path = os.path.abspath(args.image)
    dir_name = os.path.join(os.getcwd(), "output")
    # FIX: makedirs(..., exist_ok=True) replaces the racy isdir/mkdir pair
    # (TOCTOU) and always chdirs into the same absolute path it ensured.
    os.makedirs(dir_name, exist_ok=True)
    os.chdir(dir_name)
    
    print("Working Dir :", os.getcwd())
    print("Target DFXML:", dfxml_path)
    print("Target IMAGE:", image_path)

    # extract_file (defined elsewhere) writes each file object into cwd.
    dfxml.read_dfxml(xmlfile=open(dfxml_path,'rb'),callback=extract_file)
Exemplo n.º 19
0
 def read(self, f):
     """Read DFXML from *f* (a path or a binary file object) via self.pass1.

     When *f* is a string it is remembered in self.fname and opened in
     binary mode first.
     """
     if type(f) == str:
         self.fname = f
         f = open(f, 'rb')
     dfxml.read_dfxml(xmlfile=f, callback=self.pass1)
Exemplo n.º 20
0
    parser.add_argument("--prefix",
                        type=str,
                        help="Only output files with the given prefix")
    parser.add_argument("--distinct",
                        action='store_true',
                        help='Report the distinct files')
    parser.add_argument(
        "--dups",
        action='store_true',
        help='Report the files that are dups, and give dup count')
    args = parser.parse_args()

    dobj = dedup()

    # A truncated DFXML input raises ExpatError; tolerate it and report on
    # whatever was parsed before the failure.
    try:
        dfxml.read_dfxml(open(args.dfxml, 'rb'), callback=dobj.process)
    except xml.parsers.expat.ExpatError:
        pass

    print("Total files: {:,}  total MD5s processed: {:,}  Unique MD5s: {:,}".
          format(dobj.files, dobj.md5s, len(dobj.seen)))

    if args.distinct:

        def report_distinct(names):
            # Honor the optional --prefix filter before printing.
            if args.prefix and not names[0].startswith(args.prefix): return
            print("distinct: ", names[0])

        dobj.report(lambda names: len(names) == 1, report_distinct)

    if args.dups:
Exemplo n.º 21
0
# This software was developed at the National Institute of Standards
# and Technology by employees of the Federal Government in the course
# of their official duties. Pursuant to title 17 Section 105 of the
# United States Code this software is not subject to copyright
# protection and is in the public domain. NIST assumes no
# responsibility whatsoever for its use by other parties, and makes
# no guarantees, expressed or implied, about its quality,
# reliability, or any other characteristic.
#
# We would appreciate acknowledgement if the software is used.
"""
This script confirms that the DFXML pip-managed packaging exposes the dfxml package and the objects.py module.
"""

import sys

import dfxml
import dfxml.objects


def nop(x):
    """No-op callback: accept one argument and deliberately ignore it."""
    return None


# Smoke test 1: the legacy SAX-style reader accepts a binary file handle.
with open(sys.argv[1], "rb") as fh:
    dfxml.read_dfxml(fh, callback=nop)

# Smoke test 2: the objects.py API can iterparse the same file by path.
for (event, obj) in dfxml.objects.iterparse(sys.argv[1]):
    pass