def remove_similar_files(self, file_list1, file_list2):
    """Delete every stored similarity between the files of *file_list1*
    and the files of *file_list2*, and invalidate affected cache entries.

    No-op when either list is None or empty.
    """
    if not file_list1 or not file_list2:
        return
    # Acquire the DB lock first, then the cache lock (same order as
    # add_similar_files, avoiding lock-order deadlock); the context
    # managers guarantee release even if a DB call raises.
    with self.__db_lock, self.__similarities_lock:
        for file1 in file_list1:
            file1 = dir_tools.normalize(file1)
            # Drop any cached similarity list so callers re-query.
            if file1 in self.__similarities:
                del self.__similarities[file1]
            for file2 in file_list2:
                file2 = dir_tools.normalize(file2)
                # A pair may have been stored in either column order.
                self.__c.execute('''DELETE FROM similarity
                                    WHERE ((file1 = ? AND file2 = ?) OR
                                           (file1 = ? AND file2 = ?))
                                 ''', (file2, file1, file1, file2))
                if file2 in self.__similarities:
                    del self.__similarities[file2]
        self.__conn.commit()
def __find_similar(self, filename):
    """Query the database for every file similar to *filename* at or
    above the configured threshold and return them as a list.

    Fixes: the DB lock is now released even if the query raises, and
    the builtin name ``file`` is no longer shadowed.
    """
    filename = dir_tools.normalize(filename)
    sim = []
    with self.__db_lock:
        self.__c.execute('''SELECT * FROM similarity
                            WHERE (file1 = ? OR file2 = ?) AND
                                  (percent >= ?)''',
                         (filename, filename, self.__threshold))
        for row in self.__c:
            # The match may sit in either column; keep the *other* name.
            other = row['file1']
            if other == filename:
                other = row['file2']
            sim.append(other)
    return sim
def group_from_file(self, filename):
    """Return the id of the duplicate group containing *filename*,
    or None when the file belongs to no group."""
    target = dir_tools.normalize(filename)
    for group_id, members in self.dupes.items():
        if target in members:
            return group_id
    return None
def add_similar_files(self, file_list1, file_list2, percent):
    """Record a similarity of *percent* between every cross pair of
    *file_list1* x *file_list2*, skipping identical files and pairs
    already stored in either column order; invalidate cache entries.
    """
    # Context managers guarantee both locks are released on exception
    # (the original leaked them if a DB call raised).
    with self.__db_lock, self.__similarities_lock:
        for file1 in file_list1:
            file1 = dir_tools.normalize(file1)
            for file2 in file_list2:
                file2 = dir_tools.normalize(file2)
                if file1 == file2:
                    continue
                # Only add if neither (file1,file2) nor (file2,file1)
                # is already present.
                self.__c.execute('''SELECT * FROM similarity
                                    WHERE ((file1 = ? AND file2 = ?) OR
                                           (file1 = ? AND file2 = ?))
                                 ''', (file2, file1, file1, file2))
                if self.__c.fetchone() is None:
                    self.__c.execute('''INSERT OR IGNORE INTO similarity
                                        VALUES(?, ?, ?)
                                     ''', (file1, file2, percent))
                # Drop cached results so the next lookup re-queries.
                if file1 in self.__similarities:
                    del self.__similarities[file1]
                if file2 in self.__similarities:
                    del self.__similarities[file2]
        self.__conn.commit()
def load_csv(self, csv_file=None):
    """Reload the duplicate groups from *csv_file* (defaults to
    ``self.csv_file``), discarding any groups currently held.

    Expects columns "Group" (int id) and "File" (path).
    """
    if csv_file is None:
        csv_file = self.csv_file
    self.dupes.clear()
    # Close the handle deterministically instead of leaking it to GC
    # (the original never closed the file).
    with open(csv_file, "rb") as handle:
        for row in DictReader(handle):
            group = int(row["Group"])
            filename = dir_tools.normalize(row["File"])
            self.dupes[group].add(filename)
def add_file(self, filename, group=None):
    """Add *filename* to duplicate group *group*, creating a fresh
    group when *group* is None.

    Returns the group id, or None (with an error logged) when the
    file is already a member of some group.
    """
    filename = dir_tools.normalize(filename)
    existing = self.group_from_file(filename)
    if existing is not None:
        _LOGGER.error("File %s already a member of group %d. Try merging groups." % (filename, existing))
        return None
    if group is None:
        # Next unused id; start at 1 when no groups exist yet so that
        # max() is never called on an empty sequence.
        group = max(self.dupes.keys()) + 1 if self.dupes else 1
    self.dupes[group].add(filename)
    return group
def load_csv(self, csv_file=None):
    """Reload the duplicate groups from *csv_file* (defaults to
    ``self.csv_file``), discarding any groups currently held.

    Expects columns "Group" (int id) and "File" (path).
    """
    if csv_file is None:
        csv_file = self.csv_file
    self.dupes.clear()
    # Close the handle deterministically instead of leaking it to GC
    # (the original never closed the file).
    with open(csv_file, 'rb') as handle:
        for row in DictReader(handle):
            group = int(row['Group'])
            filename = dir_tools.normalize(row['File'])
            self.dupes[group].add(filename)
def remove_file(self, filename):
    """Remove *filename* from its duplicate group; dissolve the whole
    group when removal would leave fewer than two members."""
    filename = dir_tools.normalize(filename)
    group = self.group_from_file(filename)
    if group is None:
        _LOGGER.error("File %s not in any duplicate group, cannot remove." % filename)
        return
    # If we have two or fewer and attempt to remove a file, that would
    # leave a group with one file, which isn't much of a duplicate
    # group -- so kill the whole group right here.
    if len(self.dupes[group]) <= 2:
        self.remove_group(group)
    else:
        self.dupes[group].discard(filename)
def add_file(self, filename, group=None):
    """Add *filename* to duplicate group *group*, creating a fresh
    group when *group* is None.

    Returns the group id, or None (with an error logged) when the
    file is already a member of some group.
    """
    filename = dir_tools.normalize(filename)
    existing = self.group_from_file(filename)
    if existing is not None:
        _LOGGER.error(
            "File %s already a member of group %d. Try merging groups."
            % (filename, existing))
        return None
    if group is None:
        # Next unused id; start at 1 when no groups exist yet so that
        # max() is never called on an empty sequence.
        group = max(self.dupes.keys()) + 1 if self.dupes else 1
    self.dupes[group].add(filename)
    return group
def remove_file(self, filename):
    """Remove *filename* from its duplicate group; dissolve the whole
    group when removal would leave fewer than two members."""
    filename = dir_tools.normalize(filename)
    group = self.group_from_file(filename)
    if group is None:
        _LOGGER.error(
            "File %s not in any duplicate group, cannot remove." % filename)
        return
    # If we have two or fewer and attempt to remove a file, that would
    # leave a group with one file, which isn't much of a duplicate
    # group -- so kill the whole group right here.
    if len(self.dupes[group]) <= 2:
        self.remove_group(group)
    else:
        self.dupes[group].discard(filename)
def __store_similarities(self, filename, similarities):
    """Cache *similarities* for *filename* in the bounded in-memory
    cache, evicting a victim with a clock (second-chance) sweep over
    the ``__accessed`` slots.
    """
    filename = dir_tools.normalize(filename)
    # The context manager releases the lock even on exception.
    with self.__similarities_lock:
        # Advance the clock hand, clearing reference bits, until a
        # slot that was not recently accessed is found.
        while self.__accessed[self.__cache_frame][1]:
            self.__accessed[self.__cache_frame][1] = False
            self.__cache_frame += 1
            if self.__cache_frame >= _CACHE_SIZE:
                self.__cache_frame = 0
        old_name = self.__accessed[self.__cache_frame][0]
        # The victim slot may never have held a live entry; only a
        # missing key is expected here (the original bare except
        # swallowed every error).
        try:
            del self.__similarities[old_name]
        except KeyError:
            pass
        self.__similarities[filename] = similarities
        self.__accessed[self.__cache_frame] = [filename, True]
def __get_similarities(self, filename):
    """Return the similarity list for *filename*, serving it from the
    cache when present and otherwise querying the database and
    caching the result.
    """
    filename = dir_tools.normalize(filename)
    # The context manager replaces two manual release paths and also
    # releases the lock if anything inside raises.
    with self.__similarities_lock:
        if filename in self.__similarities:
            sim = self.__similarities[filename]
            # Set the slot's reference bit for the clock evictor.
            for i, (name, _accessed) in enumerate(self.__accessed):
                if name == filename:
                    self.__accessed[i][1] = True
                    break
            return sim
    # Cache miss: query outside the lock (the helpers below take the
    # locks they need themselves).
    sim = self.__find_similar(filename)
    self.__store_similarities(filename, sim)
    return sim
def is_file_in_group(self, filename, group):
    """Return True when the normalized *filename* is a member of the
    duplicate group identified by *group*."""
    return dir_tools.normalize(filename) in self.dupes[group]