def remove_similar_files(self, file_list1, file_list2):
   """Delete the stored similarity between every pair of files from two lists.

   For each pair (file1, file2) drawn from ``file_list1`` x ``file_list2``
   the matching row is removed from the ``similarity`` table, whichever of
   the two columns holds which file. Any cached similarity list for either
   file is dropped as well. All deletions are committed together once every
   pair has been processed.

   Args:
     file_list1: Sized collection of file names. Each name is passed
       through ``dir_tools.normalize`` before use.
     file_list2: Sized collection of file names, handled the same way.

   Returns:
     None. Nothing is done when either list is None or empty.
   """
   if (file_list1 is None or len(file_list1) == 0
       or file_list2 is None or len(file_list2) == 0):
     return

   # Normalise the second list once rather than once per entry of the first.
   normalized2 = [dir_tools.normalize(file2) for file2 in file_list2]

   # try/finally guarantees both locks are released even if the database
   # call raises; otherwise later callers would block forever.
   self.__db_lock.acquire()
   try:
     self.__similarities_lock.acquire()
     try:
       for file1 in file_list1:
         file1 = dir_tools.normalize(file1)

         # Drop the cached entry so it is rebuilt from up-to-date data.
         if file1 in self.__similarities:
           del self.__similarities[file1]

         for file2 in normalized2:
           # The pair may be stored in either column order.
           self.__c.execute('''DELETE FROM similarity
                         WHERE ((file1 = ? AND file2 = ?)
                         OR (file1 = ? AND file2 = ?))
                      ''', (file2, file1, file1, file2))

           if file2 in self.__similarities:
             del self.__similarities[file2]

       self.__conn.commit()
     finally:
       self.__similarities_lock.release()
   finally:
     self.__db_lock.release()
 def __find_similar(self, filename):
   """Look up in the database the files similar to ``filename``.

   Selects every ``similarity`` row that mentions the normalized file name
   in either column and whose ``percent`` is at least the configured
   threshold, then collects the *other* file of each row.

   Args:
     filename: File name to look up; normalized with
       ``dir_tools.normalize`` first.

   Returns:
     A list with the partner file name of each matching row (empty when
     no row matches).
   """
   filename = dir_tools.normalize(filename)

   sim = []

   # Release the lock in ``finally`` so a failing query cannot leave the
   # database locked for every other caller.
   self.__db_lock.acquire()
   try:
     self.__c.execute('''SELECT * FROM similarity
                       WHERE (file1 = ? OR file2 = ?)
                       AND (percent >= ?)''',
                      (filename, filename, self.__threshold))

     for row in self.__c:
       # A row may list the pair in either order; keep the member that is
       # not the file we asked about.
       other = row['file1']
       if other == filename:
         other = row['file2']

       sim.append(other)
   finally:
     self.__db_lock.release()

   return sim
    def group_from_file(self, filename):
        """Return the key of the first group in ``self.dupes`` holding the file.

        The file name is normalized with ``dir_tools.normalize`` and the
        groups are checked in the iteration order of ``self.dupes``.

        Returns:
            The matching group key, or None when no group contains the file.
        """
        wanted = dir_tools.normalize(filename)

        return next(
            (key for key in self.dupes if wanted in self.dupes[key]),
            None,
        )
Пример #4
0
    def group_from_file(self, filename):
        """Find which duplicate group, if any, contains ``filename``.

        The name is first passed through ``dir_tools.normalize``; groups are
        then scanned in the iteration order of ``self.dupes`` and the first
        hit wins.

        Returns:
            The key of the containing group, or None if there is none.
        """
        normalized = dir_tools.normalize(filename)

        matches = (gid for gid in self.dupes if normalized in self.dupes[gid])
        return next(matches, None)
 def add_similar_files(self, file_list1, file_list2, percent):
   """Record a similarity of ``percent`` between every pair from two lists.

   For each pair (file1, file2) drawn from ``file_list1`` x ``file_list2``
   (pairs of identical names are skipped) a row is inserted into the
   ``similarity`` table unless the pair is already stored in either column
   order. Cached similarity lists for both files are dropped so they are
   rebuilt from the database. Everything is committed once at the end.

   Args:
     file_list1: Iterable of file names; each is normalized with
       ``dir_tools.normalize``.
     file_list2: Iterable of file names, handled the same way.
     percent: Value stored in the third column of each new row.

   Returns:
     None. Nothing is done when either list is None (previously this
     raised while both locks were held, leaving them locked).
   """
   if file_list1 is None or file_list2 is None:
     return

   # Normalise the second list once rather than once per entry of the first.
   normalized2 = [dir_tools.normalize(file2) for file2 in file_list2]

   # try/finally guarantees both locks are released even if the database
   # call raises; otherwise later callers would block forever.
   self.__db_lock.acquire()
   try:
     self.__similarities_lock.acquire()
     try:
       for file1 in file_list1:
         file1 = dir_tools.normalize(file1)

         for file2 in normalized2:
           if file1 == file2:
             continue

           self.__c.execute('''SELECT * FROM similarity
                         WHERE ((file1 = ? AND file2 = ?)
                         OR (file1 = ? AND file2 = ?))
                      ''', (file2, file1, file1, file2))
           # Only add if we don't have the entry normal or reversed.
           if self.__c.fetchone() is None:
             self.__c.execute('''INSERT OR IGNORE INTO similarity
                           VALUES(?, ?, ?)
                        ''', (file1, file2, percent))

           # Drop cached entries so they are rebuilt from up-to-date data.
           if file1 in self.__similarities:
             del self.__similarities[file1]

           if file2 in self.__similarities:
             del self.__similarities[file2]

       self.__conn.commit()
     finally:
       self.__similarities_lock.release()
   finally:
     self.__db_lock.release()
    def load_csv(self, csv_file=None):
        """Replace the contents of ``self.dupes`` with groups read from a CSV.

        Each row must provide a ``Group`` column (converted with ``int``) and
        a ``File`` column (normalized with ``dir_tools.normalize``). The file
        is added to the collection stored under its group number.

        Args:
            csv_file: Path of the CSV file. Defaults to ``self.csv_file``
                when None.
        """
        if csv_file is None:
            csv_file = self.csv_file

        self.dupes.clear()

        # Load in our CSV. The context manager closes the handle even if a
        # row fails to parse; the original never closed it.
        # NOTE(review): binary mode is kept as in the original; under
        # Python 3 the csv module expects a text-mode file -- confirm which
        # interpreter this runs on.
        with open(csv_file, "rb") as csv_handle:
            for row in DictReader(csv_handle):
                group = int(row["Group"])
                filename = dir_tools.normalize(row["File"])

                # Presumably ``self.dupes`` creates missing groups on access
                # (e.g. a defaultdict of sets) -- verify against its definition.
                self.dupes[group].add(filename)
    def add_file(self, filename, group=None):
        """Add a file to a duplicate group.

        The file name is normalized with ``dir_tools.normalize``. A file that
        already belongs to some group is rejected with an error log entry.

        Args:
            filename: File to add.
            group: Key of the group to add the file to. When None, a new
                group numbered one above the current highest key is used.

        Returns:
            The key of the group the file was added to, or None when the
            file is already a member of a group.
        """
        filename = dir_tools.normalize(filename)

        test_group = self.group_from_file(filename)
        if test_group is not None:
            _LOGGER.error("File %s already a member of group %d. Try merging groups." % (filename, test_group))
            return None

        if group is None:
            # max() raises ValueError on an empty sequence, so fall back to a
            # fixed first id when no groups exist yet.
            # NOTE(review): 0 is chosen as the first id -- confirm it matches
            # the numbering used elsewhere.
            group = max(self.dupes.keys()) + 1 if self.dupes else 0

        self.dupes[group].add(filename)

        return group
Пример #8
0
    def load_csv(self, csv_file=None):
        """Reload ``self.dupes`` from a CSV file of duplicate groups.

        Existing groups are cleared first. For every row, the ``Group``
        column is converted with ``int`` and the ``File`` column is normalized
        with ``dir_tools.normalize``; the file is then added under that group.

        Args:
            csv_file: Path of the CSV file. Defaults to ``self.csv_file``
                when None.
        """
        if csv_file is None:
            csv_file = self.csv_file

        self.dupes.clear()

        # Load in our CSV. Opening it in a ``with`` block makes sure the
        # handle is closed, including when a row cannot be parsed.
        # NOTE(review): binary mode is kept as in the original; under
        # Python 3 the csv module expects a text-mode file -- confirm which
        # interpreter this runs on.
        with open(csv_file, 'rb') as csv_handle:
            for row in DictReader(csv_handle):
                group = int(row['Group'])
                filename = dir_tools.normalize(row['File'])

                # Presumably ``self.dupes`` creates missing groups on access
                # (e.g. a defaultdict of sets) -- verify against its definition.
                self.dupes[group].add(filename)
    def remove_file(self, filename):
        """Remove a file from the duplicate group that contains it.

        The file name is normalized with ``dir_tools.normalize``. If the file
        is in no group an error is logged and nothing changes.

        Args:
            filename: File to remove.
        """
        filename = dir_tools.normalize(filename)

        group = self.group_from_file(filename)
        # Identity test: a group key that compares equal to None through a
        # custom __eq__ must not be mistaken for "not found".
        if group is None:
            _LOGGER.error("File %s not in any duplicate group, cannot remove." % filename)
            return

        # If we have two or fewer and attempt to remove a file,
        # that would leave us with a group with one file, which
        # isn't much of a duplicate group. So just save the trouble
        # and kill it all right here.
        if len(self.dupes[group]) <= 2:
            self.remove_group(group)
        else:
            self.dupes[group].discard(filename)
Пример #10
0
    def add_file(self, filename, group=None):
        """Add a file to a duplicate group.

        The file name is normalized with ``dir_tools.normalize``. A file that
        already belongs to some group is rejected with an error log entry.

        Args:
            filename: File to add.
            group: Key of the group to add the file to. When None, a new
                group numbered one above the current highest key is used.

        Returns:
            The key of the group the file was added to, or None when the
            file is already a member of a group.
        """
        filename = dir_tools.normalize(filename)

        test_group = self.group_from_file(filename)
        if test_group is not None:
            _LOGGER.error(
                "File %s already a member of group %d. Try merging groups." %
                (filename, test_group))
            return None

        if group is None:
            # max() raises ValueError on an empty sequence, so fall back to a
            # fixed first id when no groups exist yet.
            # NOTE(review): 0 is chosen as the first id -- confirm it matches
            # the numbering used elsewhere.
            group = max(self.dupes.keys()) + 1 if self.dupes else 0

        self.dupes[group].add(filename)

        return group
Пример #11
0
    def remove_file(self, filename):
        """Remove a file from the duplicate group that contains it.

        The file name is normalized with ``dir_tools.normalize``. If the file
        is in no group an error is logged and nothing changes.

        Args:
            filename: File to remove.
        """
        filename = dir_tools.normalize(filename)

        group = self.group_from_file(filename)
        # Identity test: a group key that compares equal to None through a
        # custom __eq__ must not be mistaken for "not found".
        if group is None:
            _LOGGER.error(
                "File %s not in any duplicate group, cannot remove." %
                filename)
            return

        # If we have two or fewer and attempt to remove a file,
        # that would leave us with a group with one file, which
        # isn't much of a duplicate group. So just save the trouble
        # and kill it all right here.
        if len(self.dupes[group]) <= 2:
            self.remove_group(group)
        else:
            self.dupes[group].discard(filename)
 def __store_similarities(self, filename, similarities):
   """Cache ``similarities`` for ``filename``, evicting one older entry.

   ``self.__accessed`` is a list of ``[name, accessed_flag]`` slots walked
   circularly via ``self.__cache_frame``. Slots whose flag is set get the
   flag cleared and are skipped; the first slot found with a clear flag is
   reused: the cache entry of the name it held is dropped and the slot is
   handed to ``filename`` with its flag set.

   Args:
     filename: File name the list belongs to; normalized with
       ``dir_tools.normalize``.
     similarities: Value to store in the cache for that file.
   """
   filename = dir_tools.normalize(filename)

   # Release the lock in ``finally`` so an unexpected error cannot leave
   # the cache locked for every other caller.
   self.__similarities_lock.acquire()
   try:
     # Give recently used slots a second chance: clear their flag and move
     # on, wrapping around at the end of the table.
     while self.__accessed[self.__cache_frame][1]:
       self.__accessed[self.__cache_frame][1] = False
       self.__cache_frame += 1

       if self.__cache_frame >= _CACHE_SIZE:
         self.__cache_frame = 0

     old_name = self.__accessed[self.__cache_frame][0]
     try:
       del self.__similarities[old_name]
     except KeyError:
       # The slot's previous name has no cached entry (slot never used, or
       # the entry was invalidated elsewhere); nothing to evict.
       pass

     self.__similarities[filename] = similarities
     self.__accessed[self.__cache_frame] = [filename, True]
   finally:
     self.__similarities_lock.release()
 def __get_similarities(self, filename):
   """Return the files similar to ``filename``, using the cache if possible.

   On a cache hit the cached list is returned and the file's slot in
   ``self.__accessed`` is flagged as recently used. On a miss the list is
   fetched with ``__find_similar`` and stored with ``__store_similarities``.

   Args:
     filename: File name to look up; normalized with
       ``dir_tools.normalize``.

   Returns:
     The list of similar file names for ``filename``.
   """
   filename = dir_tools.normalize(filename)

   # A single ``finally`` releases the lock on both the hit and miss paths,
   # and also if anything raises while it is held.
   self.__similarities_lock.acquire()
   try:
     if filename in self.__similarities:
       sim = self.__similarities[filename]

       # Mark the slot holding this file as recently used.
       for slot in self.__accessed:
         if slot[0] == filename:
           slot[1] = True
           break

       return sim
   finally:
     self.__similarities_lock.release()

   # Cache miss. The lock is already released here, which matters because
   # __store_similarities acquires it again itself.
   sim = self.__find_similar(filename)
   self.__store_similarities(filename, sim)

   return sim
 def is_file_in_group(self, filename, group):
     """Tell whether ``filename`` (after normalization) belongs to ``group``.

     Returns:
         True if the normalized name is in ``self.dupes[group]``, else False.
     """
     normalized = dir_tools.normalize(filename)
     members = self.dupes[group]
     return normalized in members
Пример #15
0
 def is_file_in_group(self, filename, group):
     """Check membership of a file in one duplicate group.

     The name is normalized with ``dir_tools.normalize`` and then looked up
     in ``self.dupes[group]``.

     Returns:
         True when the group contains the normalized name, otherwise False.
     """
     candidate = dir_tools.normalize(filename)
     group_files = self.dupes[group]
     return candidate in group_files