Example #1
def duplicates_update(cnx):
    """
        Selects the *real* duplicates (= files having the same *full* hash) and marks them in the DB.

        Args:
            cnx (sqlite3.Connection): Connection object

        Returns:
            t (time): The execution time of this function        
            nb (int): The number of files having duplicates
            size (int): The size of all files that have duplicates
    """

    global last_step, last_id

    # Start time

    chrono = utils.Chrono()
    chrono.start()

    #
    # ---> All-in-one SQL query (updating the files in the selection of duplicates)
    #
    # The query has 2 parts:
    #   . The UPDATE that sets the 'has_duplicate' flag of all files whose hash is among the selected ones;
    #   . The SELECT used in the 'IN' clause, which selects the (complete) hashes present more than once.
    #

    cnx.execute("UPDATE filelist SET has_duplicate = True WHERE hash IN \
        (SELECT hash FROM filelist WHERE hash NOT NULL GROUP BY hash HAVING COUNT(hash) > 1 ORDER BY hash)"
                )

    #
    #  ---> Last commit
    #

    # Checkpoint

    last_step = "pre_duplicates_rehash"
    last_id = "all"

    utils.checkpoint_db(cnx, "duplicates_update", "all", commit=True)

    # Results...

    nb, size = duplicates_select(cnx)

    # End time
    chrono.stop()

    return chrono.elapsed(), nb, size
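
A minimal usage sketch for the function above, assuming the module's helpers (utils, duplicates_select) are importable and the 'filelist' table already has the 'hash' and 'has_duplicate' columns; the database file name is hypothetical:

import sqlite3

cnx = sqlite3.connect("duplicates.db")   # hypothetical DB built by the earlier steps
t, nb, size = duplicates_update(cnx)
print("Marked {} files having duplicates ({} bytes) in {:.2f} sec".format(nb, size, t))
cnx.close()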
Example #2
def move_files(db):
    """
        Move files marked for deletion in a "trash" directory. 

        Args:
            db (text): Name of the file used for storing sqlite3 database

        Returns:
            nb (int): Number of files moved
            size(int): Size of deleted (trashed) files

    """

    cnx = sqlite3.connect(db)

    # Nb of marked files

    res = cnx.execute(
        "SELECT count(fid) FROM filelist WHERE marked_for_deletion = '1'")
    nb_to_delete = res.fetchone()[0]
    str_fmt = FMT_HIGH + "{}" + FMT_RESET + " files to delete/trash (total)."
    print(str_fmt.format(nb_to_delete))

    # Remaining files to delete

    res = cnx.execute(
        "SELECT count(fid) FROM filelist WHERE (marked_for_deletion = '1') AND (trashed IS NULL);"
    )
    nb_remaining = res.fetchone()[0]
    str_fmt = FMT_HIGH + "{}" + FMT_RESET + " remaining files to delete/trash."
    print(str_fmt.format(nb_remaining))

    # Selecting files to delete

    res = cnx.execute(
        "SELECT * FROM filelist WHERE (marked_for_deletion = '1') AND (trashed IS NULL)"
    )

    # Start time
    chrono = utils.Chrono()
    chrono.start()

    # Big loop

    nb_trash = 0
    nb_fail = 0
    nb = 0
    size_deleted = 0

    for row in res:

        nb = nb + 1

        fid, hash, _, path, name, orig_path, size, _, _, _, master, has_dup, _, _, _, _ = row

        original_file = os.path.join(path, name)
        rel_path = os.path.relpath(path, orig_path)
        copy_file = os.path.join(trash, rel_path, name)
        trash_file_path = os.path.dirname(copy_file)

        try:
            if not (os.path.exists(trash_file_path)):
                os.makedirs(trash_file_path)
            # --- shutil.copy2(original_file, copy_file)
            shutil.move(original_file, copy_file)
            nb_trash = nb_trash + 1
            size_deleted = size_deleted + size
            cnx.execute(
                "UPDATE filelist SET trashed='1', delete_error=NULL WHERE fid = ?",
                (fid, ))

        except OSError as ose:

            nb_fail = nb_fail + 1
            err_num = ose.errno
            cnx.execute("UPDATE filelist SET delete_error=? WHERE fid = ?", (
                err_num,
                fid,
            ))
            #print("({}) {} --> {}".format(err_num, original_file, copy_file))

        # Where am I?

        if ((nb % 10) == 0):

            perc = ((nb_trash + nb_fail) / nb_remaining) * 100
            print(FMT_STR_TRASH_PROCESSING.format(nb_trash, nb_fail, perc,
                                                  chrono.elapsed()),
                  end="\r",
                  flush=True)
            cnx.commit()
            if ((nb % 1000) == 0):
                utils.checkpoint_db(cnx, "filelist_pre_hash", fid, commit=True)

    # Ends connection
    cnx.commit()
    cnx.close()

    return nb_trash, nb_fail, size_deleted
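
The trash path computation above mirrors the original directory layout under the trash root (note that 'trash' is assumed to be a module-level constant). A standalone sketch of that logic, with hypothetical paths:

import os

trash = "/tmp/trash"                      # hypothetical trash root
path = "/data/photos/2021/summer"         # directory of the file to trash
name = "beach.jpg"
orig_path = "/data/photos"                # base path recorded at discovery time

rel_path = os.path.relpath(path, orig_path)      # "2021/summer"
copy_file = os.path.join(trash, rel_path, name)  # "/tmp/trash/2021/summer/beach.jpg"
print(copy_file)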
Example #3
def pre_duplicates_rehash(cnx):
    """
        Recalculates full hash for duplicate candidates (files having the same pre-hash). You can use here
        the hash function you want but consider "md5" as the best ratio entropy/execution time. The updates
        are made directly in the database.

        Args:
            cnx (sqlite3.Connection): Connection object

        Returns:
            t (time): The execution time of this function
    """

    global last_step, last_id

    # Start time
    chrono = utils.Chrono()
    chrono.start()

    #
    # ---> Selection of pre_hashes present more than once
    #

    res = cnx.execute(
        "SELECT COUNT(fid) FROM filelist WHERE pre_hash IN (SELECT pre_hash FROM filelist GROUP BY pre_hash HAVING COUNT(pre_hash) > 1);"
    )
    nb_total = res.fetchone()[0]

    if (last_step != "pre_duplicates_rehash"):

        # Here we start from the beginning and read all the files in DB
        nb = 0
        res = cnx.execute(
            "SELECT pre_hash FROM filelist GROUP BY pre_hash HAVING COUNT(pre_hash) > 1 ORDER BY pre_hash"
        )

    else:

        # Checkpoint/restart : we restart from last updated record (fid)

        res = cnx.execute(
            "SELECT COUNT(fid) FROM filelist WHERE hash IS NOT NULL")
        nb = res.fetchone()[0]

        # Restart point

        res = cnx.execute(
            "SELECT pre_hash FROM filelist WHERE pre_hash > ? GROUP BY pre_hash HAVING COUNT(pre_hash) > 1 ORDER BY pre_hash",
            (last_id, ))
        print("Restart from pre_hash {}".format(last_id))

    # Progression

    last_d = 0
    last_m = 0

    # Loop...

    for h in res:

        #
        # We look for all files that have the selected pre_hash
        #

        pre_hash = h[0]
        r = cnx.execute("SELECT * FROM filelist WHERE pre_hash = ?",
                        (pre_hash, ))

        for row in r:

            # Here we need to go a bit further: having the same pre_hash
            # does not mean that files are identical; we need to calculate the complete hash

            fid = row[0]
            filepath = os.path.join(row[3], row[4])
            full_hash, _ = file_hash_calc(filepath, "md5", False)

            cnx.execute("UPDATE filelist set hash = ? WHERE fid = (?)",
                        (full_hash, fid))

            # Checkpoint

            last_step = "pre_duplicates_rehash"
            last_id = hash

            nb = nb + 1

        # Displaying progression and committing (occasionally, and only when we get a new hash)

        d = nb // 100
        m = nb // 1000

        if (d != last_d):

            last_d = d
            perc = (nb / nb_total) * 100
            print("Rehashing duplicate candidates #{} ({:.2f}%), {:.2f} sec".
                  format(nb, perc, chrono.elapsed()),
                  end="\r",
                  flush=True)

            if (m != last_m):
                last_m = m
                utils.checkpoint_db(cnx,
                                    "pre_duplicates_rehash",
                                    fid,
                                    commit=True)

    #
    #  ---> Last commit
    #

    utils.checkpoint_db(cnx, "pre_duplicates_rehash", "all", commit=True)

    # End time
    chrono.stop()

    return chrono.elapsed(), nb
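
The duplicate-candidate selection relies on a single GROUP BY/HAVING query. A self-contained sketch of that pattern, on an in-memory database with a toy two-column schema:

import sqlite3

cnx = sqlite3.connect(":memory:")
cnx.execute("CREATE TABLE filelist (fid INTEGER PRIMARY KEY, pre_hash TEXT)")
cnx.executemany("INSERT INTO filelist(pre_hash) VALUES (?)",
                [("aaa",), ("aaa",), ("bbb",), ("ccc",), ("ccc",), ("ccc",)])

# pre_hashes present more than once are the duplicate candidates
res = cnx.execute("SELECT pre_hash, COUNT(*) FROM filelist "
                  "GROUP BY pre_hash HAVING COUNT(*) > 1 ORDER BY pre_hash")
print(res.fetchall())   # [('aaa', 2), ('ccc', 3)]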
Example #4
def filelist_pre_hash(cnx, algo):
    """

        Calculates a "pre-hash" that is a hash calculated on the first bytes of the file. 

        Listen to me well: hash calculation can be long for... long files. Here we make a first *selection*
        where we eliminate files without duplicates. As a matter of fact, if the hash of the first
        bytes are not equals, it means that the files are... not equals for sure!

        On the other hand, files with the same pre-hash need further investigation and we'll 
        calculate in the next step the complete hash, but only for duplicate candidates. Then
        we avoid unnecessary long calculation, especially for big files.

        Though you can use any system-known hash function, we suggest to use only **md5** for 
        pre-hashing, because it has enough entropy for our usage, and it's quicker in most situations.
        Better algos will have better result, but here we only want some file duplicate candidate selection.

        Args:
            cnx (sqlite3.Connection): Connection object
            algo (text): Name of the hash algo to use. 

        Returns:
            t (time): The execution time of this function

    """

    global last_step, last_id

    # Start time
    chrono = utils.Chrono()
    chrono.start()

    #
    # ---> Main loop (on all files)
    #

    res = cnx.execute("select count(fid) from filelist;")
    nb_total = res.fetchone()[0]

    if (last_step != "filelist_pre_hash"):

        nb = 0
        r = cnx.execute("SELECT * FROM filelist ORDER BY fid")

    else:

        res = cnx.execute(
            "SELECT COUNT(fid) FROM filelist WHERE fid <= ?",
            (last_id, ))
        nb = res.fetchone()[0]
        r = cnx.execute("SELECT * FROM filelist WHERE fid >? ORDER BY fid",
                        (last_id, ))
        print("Restart from fid {}".format(last_id))

    for row in r:

        # Let's get the filepath of this element

        fid = row[0]
        filepath = os.path.join(row[3], row[4])

        try:

            file_stats = os.stat(filepath)
            file_size = file_stats.st_size
            h, _ = file_hash_calc(filepath, algo)
            cnx.execute(
                "UPDATE filelist SET size = ?, pre_hash = ? WHERE fid = ?",
                (file_size, h, fid))

            # Checkpoint

            last_step = "filelist_pre_hash"
            last_id = fid

        except PermissionError:

            #
            # The file exists, but we have no read permission on it. Bad luck!
            # (os.stat() itself may have failed, so file_size is not always known here.)
            #

            cnx.execute(
                "UPDATE filelist SET access_denied = ? WHERE fid = ?",
                (True, fid))

        except OSError as ose:

            #
            # Worse: we got an OS error while retrieving the file information or during the hash calculation
            #
            # Example: we get errno 22 for a OneDrive file stored only in the cloud and not present on disk
            #

            cnx.execute(
                "UPDATE filelist SET os_errno = ?, os_strerror = ? WHERE fid = ?",
                (ose.errno, ose.strerror, fid))

        nb = nb + 1

        # Displaying progression and committing (occasionally)

        if ((nb % 100) == 0):

            perc = (nb / nb_total) * 100
            print(
                "Quick hash computing #{} files ({:.2f}%), {:.2f} sec".format(
                    nb, perc, chrono.elapsed()),
                end="\r",
                flush=True)
            if ((nb % 1000) == 0):
                utils.checkpoint_db(cnx, "filelist_pre_hash", fid, commit=True)

    #
    #  ---> Last commit
    #

    utils.checkpoint_db(cnx, "filelist_pre_hash", "all", commit=True)

    # End time
    chrono.stop()

    return chrono.elapsed()
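
Why the pre-hash filter pays off: two files whose first block already differs can never be identical, so the full hash is skipped for them. A toy, self-contained sketch of that idea (temporary files, md5 on the first block only):

import hashlib
import io
import os
import tempfile

def quick_hash(path):
    # hash only the first buffer-sized block of the file
    with open(path, "rb") as f:
        return hashlib.md5(f.read(io.DEFAULT_BUFFER_SIZE)).hexdigest()

with tempfile.NamedTemporaryFile(delete=False) as a, \
     tempfile.NamedTemporaryFile(delete=False) as b:
    a.write(b"hello" + b"x" * 10_000_000)   # two big files...
    b.write(b"world" + b"x" * 10_000_000)   # ...that differ in their first bytes

print(quick_hash(a.name) == quick_hash(b.name))   # False: full hash not needed
os.unlink(a.name)
os.unlink(b.name)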
Example #5
def file_hash_calc(filename, algo_name, pre_hash=True):
    """

        Returns the hash of a file. You can choose any hash function that your Phython environment supports.

        It can be used for a full hash computation, or only a pre-hash (much quicker with large files). Pre-hash
        computes the hash only on the first bytes. As we are looking for duplicates, two files with a different
        pre-hash are... different! No need to compute the full hash.

        On the contrary, if the pre-hash is the same for some files, we calculate the full hash (with the entire
        file content) and then we could decide if files are duplicates or not.

        Args:

            filename (text): Absolute path for the file
            algo_name(text): Name of the hash algo we use ("md5", "sha1", and even "crc32")
            pre_hash (boolean): Indicates if we compute a pre-hash or not

        Returns:

            h (text): The hash value (hexa text)
            t (int): execution time (in seconds)

        .. Example:: file_hash_calc("test.txt", "md5")

    """

    # Start time

    chrono = utils.Chrono()
    chrono.start()

    #
    #  ---> Depending on the selected algo, let's calculate the hash
    #

    if (algo_name != "crc32"):

        # Here we have the 'hashlib' hashes
        # ---

        hl = getattr(hashlib, algo_name)()

        #
        # For the pre_hash we hash only the first bytes (to speed up the calculation and also to avoid a MemoryError with big files).
        #
        # For the complete hash, we need to split the reading.
        #

        with open(filename, 'rb') as f:

            if (pre_hash):

                #
                # Pre-hash => Only on the first bytes
                #

                hl.update(f.read(io.DEFAULT_BUFFER_SIZE))

            else:

                #
                # Complete hash => split the file
                #

                while True:

                    #
                    # We read only one buffer-sized block at a time
                    # to avoid heavy RAM usage.
                    #

                    data = f.read(io.DEFAULT_BUFFER_SIZE)

                    if not data:

                        # if we don't have any more data to read, stop.
                        break

                    # we partially calculate the hash
                    hl.update(data)

        #
        # We have exceptions: the 'shake' hashes need a digest length (in bytes)
        # as argument. For all other hashes, no argument is needed and we
        # can get the hash with the hexdigest() method.
        #

        if (algo_name == "shake_128"):
            h = hl.hexdigest(128)
        elif (algo_name == "shake_256"):
            h = hl.hexdigest(256)
        else:
            h = hl.hexdigest()

    else:

        # Here we have the CRC32 "hash", computed with the binascii library.
        # We honor the pre_hash flag here too, keeping a running CRC for the full read.

        with open(filename, 'rb') as f:
            if (pre_hash):
                crc = binascii.crc32(f.read(io.DEFAULT_BUFFER_SIZE))
            else:
                crc = 0
                while True:
                    data = f.read(io.DEFAULT_BUFFER_SIZE)
                    if not data:
                        break
                    crc = binascii.crc32(data, crc)
            h = format(crc, '08x')

    # End time

    chrono.stop()

    #
    # ---> We return both hash and execution time (for info)
    #

    return h, chrono.elapsed()
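
A usage sketch for file_hash_calc, assuming the module's utils.Chrono helper is available; it creates a small sample file first so the snippet is self-contained:

with open("test.txt", "w") as f:
    f.write("duplicate finder sample\n")

pre, t1 = file_hash_calc("test.txt", "md5")           # pre-hash: first block only
full, t2 = file_hash_calc("test.txt", "md5", False)   # full hash: entire file
print(pre == full)   # True here: the file fits in a single block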
Example #6
def directory_lookup(cnx, basepath, master, protected):
    """

        Looks (hierarchically) for all files within the folder structure, and stores the path and the 
        name of each file. No file access is made (to save time).

        Args:
            cnx (sqlite3.Connection): Connection object
            basepath (text): Array of file paths we will look into.

        Returns:
            t (time): The execution time of this function

    """

    global last_step, last_id

    # Start time
    chrono = utils.Chrono()
    chrono.start()

    # Nb of files init
    nb = 0

    # Filepath init

    if (basepath == ""):
        basepath = "."

    #
    # ---> File discovery. Thanks to Python, we just need to call an existing function (os.walk)...
    #

    for root, _, files in os.walk(basepath, topdown=True):

        #
        #  We just look for files, we don't process the directories
        #

        for name in files:

            # Hey, we got one (file)!

            nb = nb + 1
            cnx.execute(
                "INSERT INTO filelist(path, name, access_denied, original_path, master, protected)\
                            VALUES (?, ?, ?, ?, ?, ?)",
                (root, name, False, basepath, master, protected))

            # Checkpoint

            last_step = "directory_lookup"
            last_id = "in progress"

            # Displaying progression and committing (occasionally)

            if ((nb % 100) == 0):
                print("Discovering #{} files ({:.2f} sec)".format(
                    nb, chrono.elapsed()),
                      end="\r",
                      flush=True)
                if ((nb % 1000) == 0):
                    cnx.commit()
            """
            except PermissionError:

                cnx.execute("INSERT INTO filelist(pre_hash, path, name, access_denied)\
                                VALUES (?, ?, ?, ?)",("?", root, name, True))

            except OSError as ose:

                cnx.execute("INSERT INTO filelist(pre_hash, path, name, os_errno, os_strerror)\
                                VALUES (?, ?, ?, ?, ?)",("?", root, name, ose.errno, ose.strerror))
            """

    #
    # ---> Last commit
    #

    utils.checkpoint_db(cnx, "directory_lookup", basepath, commit=True)

    # End time
    chrono.stop()

    return chrono.elapsed()
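
A usage sketch for directory_lookup on an in-memory database, with the 'filelist' schema reduced to the columns the INSERT needs (it also assumes the module's 'utils' helpers and the 'last_step'/'last_id' globals are available):

import sqlite3

cnx = sqlite3.connect(":memory:")
cnx.execute("CREATE TABLE filelist (fid INTEGER PRIMARY KEY, path TEXT, "
            "name TEXT, access_denied INTEGER, original_path TEXT, "
            "master INTEGER, protected INTEGER)")

t = directory_lookup(cnx, ".", master=False, protected=False)
nb = cnx.execute("SELECT COUNT(*) FROM filelist").fetchone()[0]
print("{} files discovered in {:.2f} sec".format(nb, t))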