def process_books(library, calibre_ids=None, book_text_files=None, multi_thread=False):
    '''Takes iterables of book identifiers, and does all processing / thread
    management of said processing.

    Every identifier is looked up in the library; a book that is not found is
    created and added to the library first. The ids of all requested books are
    then run through fingerprint_initializer() and book_comparator().

    library         -- object providing get_book_cid(), get_book_textfile()
                       and add_book().
    calibre_ids     -- optional iterable of calibre ids.
    book_text_files -- optional iterable of text-file identifiers.
    multi_thread    -- must be falsy; the threaded path is not implemented and
                       raises TBD.

    Returns None.
    '''
    bookids = []

    if calibre_ids:
        for calibre_id in calibre_ids:
            book = library.get_book_cid(calibre_id)
            if not book:
                book = Book.Book(calibreid=calibre_id)
                library.add_book(book)
            # bookids is a list: it must be append(), not add() (which only
            # exists on sets and raised AttributeError here).
            bookids.append(book.id)

    if book_text_files:
        for text_file in book_text_files:
            book = library.get_book_textfile(text_file)
            if not book:
                book = Book.Book(textfile=text_file)
                library.add_book(book)
            bookids.append(book.id)

    if not multi_thread:
        # Just run through our tasks serially
        fingerprint_initializer(library, bookids)
        book_comparator(library, bookids)
    else:
        # NOTE(review): TBD is not defined in the visible part of this file --
        # confirm it is a real exception class, otherwise this is a NameError.
        raise TBD('Multithreading is not yet implimented')
    #TODO: must add the completed scan uuid to books when finished


if __name__ == '__main__':
    # Run the controller test case when this module is executed as a script.
    import zUnitTest
    zUnitTest.run_testcase(zUnitTest.ControllerTest)
# NOTE(review): the statements down to `return minhashes` are the tail of a
# method whose `def` line (and the `if` that the `else:` below pairs with) lie
# above this chunk. The nesting depth shown here is inferred from the
# statements themselves -- confirm against the full file.
                if myhash < minhashes[h]:
                    # Keep the smallest hash value seen so far for slot h.
                    minhashes[h] = myhash
        else:
            # Alternative path: hash every word with the builtin hash() into an
            # OptimizeCompare.HashSequence and let
            # OptimizeCompare.shingle_and_hash() produce the hashes.
            hashedwords = len(words)
            hashes = OptimizeCompare.HashSequence()
            # presumably sizes the sequence to hashedwords entries filled with 0
            # so it can be assigned by index below -- TODO confirm
            hashes.resize(hashedwords, 0)
            for i in xrange(hashedwords):
                hashes[i] = hash(words[i])
            # NOTE(review): this empty HashSequence is rebound on the very next
            # line, so the assignment here has no lasting effect.
            myhashes = OptimizeCompare.HashSequence()
            myhashes = OptimizeCompare.shingle_and_hash(hashes, Utility.myMasks.masks, L_tables, shingle_size)
            minhashes = array('l',myhashes) #Convert to native python type because this array is small, but will be accessed frequently!
        return minhashes

    def __generate_minhashes(self,words, shingle_size=5):
        '''Return the min-hashes for a sequence of words.

        words        -- sequence of words; only len() is used here, the sequence
                        itself is handed on to __shingle_and_hash().
        shingle_size -- shingle size passed to __shingle_and_hash(); replaced by
                        1 when there are fewer words than shingle_size.

        Returns an empty array('l') when fewer than 4 words are given,
        otherwise whatever __shingle_and_hash() returns.
        '''
        # Number of min-hash tables comes from the shared Compare parameters.
        L_tables = Compare.myMinHashParams.minhash_tables
        if len(words) < 4:
            # Fewer than 4 words: return an empty result without hashing.
            return array('l',[])
        elif len(words) < shingle_size:
            # Fewer words than the shingle size (with the default of 5 this is
            # the 4-word case): fall back to a shingle size of 1.
            shingle_size = 1
        return self.__shingle_and_hash(words, shingle_size, L_tables)
        #self.__find_mins(tables, L_tables, minhashes)

if __name__ == '__main__':
    # Run the fingerprint test case when this module is executed as a script.
    import zUnitTest
    zUnitTest.run_testcase(zUnitTest.FingerprintTest)