class Hashedbitset():
    def __init__(self, size):
        self._value = Bitset(size)
        self._size = size
        self._hasher = Hasher()

    def Add(self, item):
        self._value.Add(self._hasher.hash(item, 1)[0])

    def Size(self):
        return self._value.Size()

    def Contains(self, item):
        n = self._hasher.hash(item, 1)[0]
        return self._value.Contains(n)
def to_data(filename):
    h = Hasher(2 ** 20)
    df_all = pd.merge(pd.merge(df, user_df, on='u_st_2_uid', how='left'),
                      item_df, on='d_st_2_did', how='left')
    print(df_all.dtypes)
    print('*' * 18)
    print(df_all.count())
    print('*' * 18)
    df_json = df_all.to_dict('records')
    # multiprocessing
    p = Pool(10)
    results = []
    for i, feature_value_dict in enumerate(df_json):
        # if i > 100:
        #     break
        results.append(p.apply_async(to_p, args=(i, feature_value_dict, h)))
    print('*' * 18)
    print(len(results))
    print('*' * 18)
    p.close()
    p.join()
    writer = tf.python_io.TFRecordWriter(filename)
    for r in results:
        writer.write(r.get())
    writer.close()
def write_to_cache_without_js(self):
    process = CrossPlatformProcess(self)
    (stdout, stderr) = process.run_sync(r'gulp -v')
    if process.failed or not GulpVersion(stdout).supports_tasks_simple():
        raise Exception(
            "Gulp: Could not get the current gulp version or your gulp CLI version is lower than 3.7.0")
    (stdout, stderr) = process.run_sync(r'gulp --tasks-simple')
    gulpfile = self.get_gulpfile_path(self.working_dir)
    if not stdout:
        raise Exception("Gulp: The result of `gulp --tasks-simple` was empty")
    self.write_cache_file({
        gulpfile: {
            "sha1": Hasher.sha1(gulpfile),
            "tasks": dict((task, {"name": task, "dependencies": ""})
                          for task in stdout.split("\n") if task)
        }
    })
def fetch_json(self):
    cache_file = CacheFile(self.working_dir)
    gulpfile = self.get_gulpfile_path(self.working_dir)
    data = None
    if cache_file.exists():
        filesha1 = Hasher.sha1(gulpfile)
        data = cache_file.read()
        if gulpfile in data and data[gulpfile]["sha1"] == filesha1:
            return data[gulpfile]["tasks"]
    self.callcount += 1
    if self.callcount == 1:
        return self.write_to_cache()
    if data is None:
        raise Exception("Could not write to cache gulpfile.")
    if gulpfile in data:
        raise Exception(
            "Sha1 from gulp cache ({0}) is not equal to calculated ({1}).\nTry erasing the cache and running Gulp again."
            .format(data[gulpfile]["sha1"], filesha1))
    else:
        raise Exception(
            "Have you renamed a folder?.\nSometimes Sublime doesn't update the project path, try removing the folder from the project and adding it again.")
def fetch_json(self):
    jsonfilename = os.path.join(self.working_dir, GulpCommand.cache_file_name)
    gulpfile = self.get_gulpfile_path(self.working_dir)
    data = None
    if os.path.exists(jsonfilename):
        filesha1 = Hasher.sha1(gulpfile)
        json_data = codecs.open(jsonfilename, "r", "utf-8", errors='replace')
        try:
            data = json.load(json_data)
            if gulpfile in data and data[gulpfile]["sha1"] == filesha1:
                return data[gulpfile]["tasks"]
        finally:
            json_data.close()
    self.callcount += 1
    if self.callcount == 1:
        return self.write_to_cache()
    if data is None:
        raise Exception("Could not write to cache gulpfile.")
    raise Exception(
        "Sha1 from gulp cache ({0}) is not equal to calculated ({1}).\nTry erasing the cache and running Gulp again."
        .format(data[gulpfile]["sha1"], filesha1))
def __init__(self, init_dic):
    self.logger = getLogger()
    if not self.__isValidInfo(init_dic):
        self.logger.error("Failed to init RequestURLCrawler : Invalid input information")
        exit(1)
    self.info_dic = init_dic
    self.cursor = None
    # unvisited seeds (minimum heap ordered by page no.)
    # heappush(req_url_queue, (guid_hash, url_data))
    self.req_url_queue = []
    # visited + fully parsed data, dic[view_guid_hash] = URLData()
    self.url_data_dic = dict()
    self.hasher = Hasher()
    self.url_factory = None
    self.html_parser = None
    self.xml_producer = XMLPrinter(OUTPUT_PATH)
def spi_response(self, response, *args, **kwargs):
    '''Responses of the spi_request are handled here.'''
    if 'text/html' in response.headers['Content-Type']:
        hash_val = Hasher.HashMD5(response.content)
        if hash_val not in self.URLhash:
            self.URLhash.add(hash_val)
            # set.union() returns a new set and discards it; update() mutates
            # URLset in place, which is what the dedup logic needs.
            self.URLset.update(Links.parse_link(response))
class TestHash(unittest.TestCase):
    def setUp(self):
        self.hasher = Hasher("words.txt", nwords=3, delimeter="-")

    def test_smoke_test(self):
        data = """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi"""
        self.assertEqual(self.hasher.process(data), "Isaac-Bremely-Trueheartedly")

    def test_long_text(self):
        data = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. In non egestas dolor. Nulla molestie sed justo sed elementum. Aliquam erat volutpat. Morbi odio lectus, consequat nec nisl eu, vulputate convallis ex. Etiam faucibus lorem non tempus malesuada. Praesent aliquet, ligula et fringilla euismod, nulla felis accumsan est, bibendum semper ligula ligula ut leo. Donec nibh metus, fermentum in fermentum id, vulputate vel sapien. Quisque gravida eros in rhoncus convallis. Proin eu dui finibus, maximus nisl sed, dignissim odio. Fusce vel est eu justo imperdiet suscipit eu mattis turpis. Nam sed odio sollicitudin, pulvinar purus non, mollis nulla. Nam sed euismod orci, sed vestibulum mauris. Curabitur cursus est in ornare mollis. Nulla urna turpis, tincidunt non tempor eu, auctor et nisi. Vivamus lobortis elit vel dolor pharetra blandit. Morbi in feugiat odio. In nec augue velit. Suspendisse interdum purus in metus luctus, eu rhoncus mauris porta. Aliquam pharetra, elit vitae convallis congue, libero velit malesuada felis, sed sodales turpis enim sed leo. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam auctor tortor ut semper pretium. Aenean sed malesuada nisi, eget venenatis enim. Suspendisse in sagittis arcu, eu tristique turpis. Mauris dignissim eget ex sit amet egestas. Donec blandit dolor quis sapien aliquet, id rutrum lorem ultrices. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut id scelerisque sem. Cras bibendum, lorem vel dapibus placerat, odio mauris finibus sapien, a efficitur purus massa et metus. Vestibulum dolor elit, ultrices quis enim in, convallis sagittis metus. Phasellus lacinia justo elit, non elementum augue pellentesque eu. Sed nec eleifend enim. Quisque blandit felis quis porta sodales. Morbi id rutrum tellus. Integer varius felis non luctus placerat. Praesent a lacus est. Nulla sollicitudin volutpat erat, pulvinar sagittis dui imperdiet sed. Nulla tempor, leo vel malesuada ullamcorper, libero eros rutrum sem, non ullamcorper tortor ex sed nibh. Cras ac lectus vitae elit dignissim rutrum. Etiam non semper mauris. Donec sem velit, elementum sit amet nibh a, pellentesque maximus velit. Nam ac velit ligula. """
        self.assertEqual(self.hasher.process(data), "Blips-Laggingly-Trochilidae")

    def test_short_text(self):
        data = """s"""
        self.assertEqual(self.hasher.process(data), "Abaca-Abusage-Blisses")

    def test_non_text_data(self):
        data = {"hello": 1, "other": "lorem"}
        self.assertEqual(self.hasher.process(data), "Interindividual-Fastbacks-Allochetite")

    def test_hash_distribution(self):
        results = defaultdict(list)
        collisions = 0
        with open("words.txt") as input:
            for line in input:
                hashed = self.hasher.process(line)
                if hashed in results:
                    collisions += 1
                results[hashed].append(line)
        print("Collided:")
        print({k: v for k, v in results.items() if len(v) > 1})
        # words.txt has 370102 unique words, expect very few collisions
        self.assertLessEqual(collisions, 5)
class Bloomfilter():
    def __init__(self, size, keys):
        self._value = Bitset(size)
        self._size = size
        self._keys = keys
        self._hasher = Hasher()

    def Add(self, item):
        for i in self._hasher.hash(item, self._keys):
            self._value.Add(i)

    def Size(self):
        return self._value.Size()

    def Contains(self, item):
        n = self._hasher.hash(item, self._keys)
        for i in n:
            if not self._value.Contains(i):
                return False
        return True
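A minimal usage sketch for the Bloomfilter above. The Bitset and Hasher stand-ins below are illustrative assumptions only (the project's real helpers are not shown here); they just provide the interface the filter expects so the example runs end to end.

import hashlib

class Bitset:
    # illustrative stand-in: a plain boolean array indexed modulo its size
    def __init__(self, size):
        self._bits = [False] * size
        self._n = size

    def Add(self, i):
        self._bits[i % self._n] = True

    def Contains(self, i):
        return self._bits[i % self._n]

    def Size(self):
        return self._n

class Hasher:
    # illustrative stand-in: derive k integer indices per item via SHA-256
    def hash(self, item, k):
        return [int(hashlib.sha256(('%s:%d' % (item, i)).encode()).hexdigest(), 16)
                for i in range(k)]

bf = Bloomfilter(1024, 3)
bf.Add('alice@example.com')
print(bf.Contains('alice@example.com'))   # True
print(bf.Contains('carol@example.com'))   # False with high probability; false positives are possible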
def spi_response(self, response):
    '''Responses of the spi_request are handled here.'''
    if ('text/html' in response.headers['Content-Type']
            and response.status_code == 200):
        hash_val = Hasher.HashMD5(response.content)
        if self.redis.getVariable(hash_val) is None:
            if self.database.isConn():
                self.database.saveData(hash=hash_val, url=response.url, content=response)
            self.redis.setVariable(hash_val, response.url)
            for link in Links.parse_link(response):
                self.URLset.put(link)
def write_to_cache_without_js(self):
    process = CrossPlatformProcess(self.working_dir)
    (stdout, stderr) = process.run_sync(r'gulp -v')
    if process.failed or not GulpVersion(stdout).supports_tasks_simple():
        raise Exception("Gulp: Could not get the current gulp version or your gulp CLI version is lower than 3.7.0")
    (stdout, stderr) = process.run_sync(r'gulp --tasks-simple')
    gulpfile = self.get_gulpfile_path(self.working_dir)
    if not stdout:
        raise Exception("Gulp: The result of `gulp --tasks-simple` was empty")
    CacheFile(self.working_dir).write({
        gulpfile: {
            "sha1": Hasher.sha1(gulpfile),
            "tasks": dict((task, {"name": task, "dependencies": ""})
                          for task in stdout.split("\n") if task)
        }
    })
def fetch_json(self):
    cache_file = CacheFile(self.working_dir)
    gulpfile = self.get_gulpfile_path(self.working_dir)
    data = None
    if cache_file.exists():
        filesha1 = Hasher.sha1(gulpfile)
        data = cache_file.read()
        if gulpfile in data and data[gulpfile]["sha1"] == filesha1:
            return data[gulpfile]["tasks"]
    self.callcount += 1
    if self.callcount == 1:
        return self.write_to_cache()
    if data is None:
        raise Exception("Could not write to cache gulpfile.")
    if gulpfile in data:
        raise Exception("Sha1 from gulp cache ({0}) is not equal to calculated ({1}).\nTry erasing the cache and running Gulp again."
                        .format(data[gulpfile]["sha1"], filesha1))
    else:
        raise Exception("Have you renamed a folder?.\nSometimes Sublime doesn't update the project path, try removing the folder from the project and adding it again.")
queue_hashed = queue.Queue()
queue_ext_path = queue.Queue()
queue_csv = queue.Queue()
queue_csved = queue.Queue()
queue_blk = queue.Queue()
queue_mem = queue.Queue()
queue_memed = queue.Queue()
queue_rslt = queue.Queue()
queue_elastic = queue.Queue()

see = Seeker(queue_dis, IN_DIR, BASE_NAME, CHECK_TIME)
dis = Dispatcher(queue_dis, queue_extrac, queue_extraced, queue_ext_path, queue_av,
                 queue_hash, queue_hashed, queue_csv, queue_csved, queue_blk,
                 queue_mem, queue_memed, queue_elastic,
                 IN_DIR, WORK_DIR, OUT_DIR, DIR_OUT)
has = Hasher(queue_hash, queue_hashed, IN_DIR, WORK_DIR, BLOCK_SIZE_HASH)
ext = Extractor(queue_extrac, queue_extraced, queue_ext_path, IN_DIR, WORK_DIR)
csv = Csver(queue_csv, queue_csved, WORK_DIR, OUT_DIR)
blk = Bulker(queue_blk, queue_extraced, WORK_DIR, OUT_DIR)
mem = Memer(queue_mem, queue_extraced, IN_DIR, WORK_DIR, OUT_DIR)
#tim = Timeliner(queue_extrac, WORK_DIR, OUT_DIR)
avc = Avcheck(queue_av, WORK_DIR, OUT_DIR)
#elas = Elasticer(queue_elastic, WORK_DIR, OUT_DIR)

see.start()
dis.start()
has.start()
ext.start()
csv.start()
#blk.start()
def generateRepostsForAll(self, count_per_post=1, res=None, rot=None,
                          asp=None, crop=None, uid=None, seed=None):
    '''Generates reposts for every single non-repost image in the image directory.'''
    names = list(filter(lambda x: '_REPOST_' not in x, self.__imageToHash.keys()))
    self.vPrint('generating ' + str(len(names)) + ' reposts')
    interrupted = False
    try:
        for i, name in enumerate(sorted(names)):
            repname = (str(uid) if uid else '') + '_REPOST_' + name
            if count_per_post == 1:
                if repname in self.__imageToHash and repname in self.__imageToText:
                    continue
            elif count_per_post > 1:
                if (str(count_per_post - 1) + repname) in self.__imageToHash and \
                        (str(count_per_post - 1) + repname) in self.__imageToText:
                    continue
            else:
                return
            if i < 30 or i % 10 == 0:
                self.vPrint('partial: %5d/%d' % (i, len(names)))
            try:
                target_path = join(self.img_dir, name)
                loc = join(self.img_dir, repname)
                bad_imgs = generate_bad_repost(target_path, count=count_per_post,
                                               res=res, rot=rot, asp=asp, crop=crop,
                                               save_loc=loc, seed=(seed + i))
                if not isinstance(bad_imgs, list):
                    bad_imgs = [(repname, bad_imgs)]
                for newrepname, bad_img in bad_imgs:
                    bad_img_hash = Hasher.hashImage(bad_img, self.__imagehash_method)
                    bad_img_text = OCR.read2Normalized(bad_img)
                    self.__imageToHash[newrepname] = bad_img_hash
                    self.__imageToText[newrepname] = bad_img_text
            except FileNotFoundError as e:
                print(e)
                print("skipped an image that doesn't exist")
                continue
            except UnidentifiedImageError as e:
                print(e)
                print('skipped an unidentified image')
                continue
        self.vPrint('done!')
    except KeyboardInterrupt:
        self.vPrint('interrupted!')
        interrupted = True
    finally:
        self.saveProcessedDataToCache()
        self.vPrint('saved!')
    return not interrupted
from hasher import Hasher

hasher = Hasher('')
print(hasher.hash('my name is jack'))
def main():
    args = parse_args()
    library_paths = args.paths
    if not library_paths:
        logging.error('no libraries specified')
        last_library_path = osxphotos.utils.get_last_library_path()
        system_library_path = osxphotos.utils.get_system_library_path()
        resp = input(f"use last .photoslibrary ({last_library_path}) [Y/n] ")
        if not resp or resp.lower() == 'y':
            library_paths.append(last_library_path)
        else:
            exit(2)
    db_session = fetch_or_initialize_db(args.db_path)
    applephotos, directories = fetch_libraries(library_paths, db_session)
    photos, videos, albums = fetch_photos(applephotos[0])  # TODO

    # TODO replace these dry-run guards with decorators
    if args.dry_run:
        logging.info('[dry-run] skipping photo persistence')
    else:
        logging.info('Persisting photo data')
        persist_photos(photos, db_session)

    hasher = Hasher()
    if args.dry_run:
        logging.info('[dry-run] skipping image encoding')
    else:
        logging.info("Encoding images with imagededup")
        imagededup_encodings = hasher.imagededup_encode(photos)
        logging.info("Encoding images with imagehash")
        imagehash_encodings = hasher.imagehash_encode(photos)
        logging.info('Persisting photo encodings')
        encodings = []
        for photo in photos:
            photo_id = photo.id
            for hash_name, value in imagededup_encodings[photo_id].items():
                enc = Encoding(photo_id=photo_id, hash_library=HashLibrary.imagededup,
                               algorithm=get_hash_algo(hash_name), value=value)
                encodings.append(enc)
            for hash_name, value in imagehash_encodings[photo_id].items():
                enc = Encoding(photo_id=photo_id, hash_library=HashLibrary.imagehash,
                               algorithm=get_hash_algo(hash_name), value=value)
                encodings.append(enc)
        db_session.add_all(encodings)
        db_session.commit()

    if args.dry_run:
        logging.info('[dry-run] skipping deduplication check and persistence')
    else:
        pass
class SQL:
    app = Flask(__name__)
    mysql = MySQL()
    hasher = Hasher()

    def __init__(self):
        self.app.config['MYSQL_DATABASE_USER'] = '******'
        self.app.config['MYSQL_DATABASE_PASSWORD'] = ''
        self.app.config['MYSQL_DATABASE_DB'] = 'library'
        self.app.config['MYSQL_DATABASE_HOST'] = 'localhost'

    def checkUser(self, email, password):
        self.mysql.init_app(self.app)
        query = "SELECT * FROM `users`"
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                print('No username/password')
            else:
                for row in r:
                    emailUser = unicode(row['email'])
                    passwordUser = unicode(row['password'])
                    if self.hasher.compareStrings(email, emailUser) and password == passwordUser:
                        return True
                return False
        except:
            print('Error CheckUser')

    def getUser(self, email, password):
        self.mysql.init_app(self.app)
        query = "SELECT * FROM `users`"
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                print('No username/password')
            else:
                for row in r:
                    emailUser = unicode(row['email'])
                    passwordUser = unicode(row['password'])
                    if self.hasher.compareStrings(email, emailUser) and password == passwordUser:
                        user = {}
                        user['iduser'] = row['id']
                        user['user'] = unicode(row['user'])
                        user['guser'] = row['guser']
                        return user
                return False
        except:
            print('Error getUser')

    def registerUser(self, email, password, username, gUser=0):
        self.mysql.init_app(self.app)
        query = ''' INSERT INTO `users`(`id`, `user`, `password`, `email`, `guser`) VALUES (NULL,''' + "'" + username + "', '" + password + "', '" + email + "', " + str(gUser) + ")"
        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query)
            con.commit()
            return True
        except:
            return False

    def updateUser(self, data, section, idUser):
        self.mysql.init_app(self.app)
        column = ""
        idUserStr = str(idUser)
        if section == 'email':
            column = '`email`'
        elif section == 'user':
            column = '`user`'
        elif section == 'pass':
            column = '`user`'
        query = "UPDATE `users` SET " + section + " = '" + data + "' WHERE id = " + idUserStr
        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query)
            con.commit()
            return True
        except:
            return False

    def getHomeBook(self, idUser):
        self.mysql.init_app(self.app)
        query = ''' SELECT `books`.`id` , `books`.`photo` , `books`.`bfile` , `books`.`bname` FROM `books` INNER JOIN `readings` ON `books`.`id` = `readings`.`idbook` WHERE `readings`.`iduser` =''' + str(idUser) + ''' ORDER BY `readings`.`lastreading` DESC LIMIT 1'''
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                print('No books')
            else:
                return r
        except:
            print('Error getHomeBooks')

    def getAllBooks(self):
        self.mysql.init_app(self.app)
        query = '''SELECT `id` , `photo` , `bname` FROM `books` ORDER BY `id` DESC'''
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                print('No books')
            else:
                return r
        except:
            print('Error getAllBooks')

    def getReadingsBooks(self, iduser):
        self.mysql.init_app(self.app)
        idUserStr = str(iduser)
        query = '''SELECT `books`.`id` , `books`.`photo` , `books`.`bname` FROM `books` INNER JOIN `readings` on `books`.`id` = `readings`.`idbook` WHERE `readings`.`iduser` like ''' + idUserStr + ''' ORDER BY `readings`.`lastreading` DESC'''
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                print('No books')
            else:
                return r
        except:
            print('Error getReadingsBooks')

    def getReadLaterBooks(self, iduser):
        self.mysql.init_app(self.app)
        idUserStr = str(iduser)
        query = '''SELECT `books`.`id` , `books`.`photo` , `books`.`bname` FROM `books` INNER JOIN `read_later` on `books`.`id` = `read_later`.`idbook` WHERE `read_later`.`iduser` like ''' + idUserStr
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                print('No books')
            else:
                return r
        except:
            print('Error getReadingsBooks')

    def getBook(self, idBook):
        self.mysql.init_app(self.app)
        query = '''SELECT `books`.`id`, `books`.`photo`, `books`.`bname`, `books`.`synopsis`, `genres`.`genre`, `books`.`idauthor` FROM `books` INNER JOIN `genres` ON `books`.`idgenre` = `genres`.`id` WHERE `books`.`id` = ''' + idBook
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                print('No book')
            else:
                return r
        except:
            print('Error getBook')

    def getAuthor(self, idAuthor):
        self.mysql.init_app(self.app)
        query = "SELECT * FROM `author` WHERE `id` = " + str(idAuthor)
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                print('No author')
            else:
                return r
        except:
            print('Error getAuthor')

    def getSimilarBooksByBook(self, idBook):
        self.mysql.init_app(self.app)
        idBookStr = str(idBook)
        query = ''' SELECT t.`id`, t.`photo`, t.`bname`, t.`synopsis`, `genres`.`genre`, t.`idauthor` from ((SELECT * FROM `books` as book WHERE `idgenre` like (SELECT `idgenre` from `books` WHERE `id` = ''' + idBookStr + ''')) UNION (SELECT * FROM `books` as book WHERE `idauthor` like (SELECT `idauthor` from `books` where `id` = ''' + idBookStr + ''')) UNION (SELECT * FROM `books` as book WHERE `idcollect` like (SELECT `idcollect` from `books` where `id` = ''' + idBookStr + '''))) as t INNER JOIN `genres` ON t.`idgenre` = `genres`.`id` WHERE t.`idgenre` = `genres`.`id` AND t.`id` NOT LIKE ''' + idBookStr + ''' ORDER BY RAND() LIMIT 6 '''
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                print('No Books')
            else:
                return r
        except:
            print('Error getSimilarBooksByBook')

    def getBooksByAuthor(self, idAuthor):
        self.mysql.init_app(self.app)
        idAuthorStr = str(idAuthor)
        query = "SELECT * FROM `books` WHERE `idauthor` =" + idAuthorStr + ''' ORDER BY RAND() LIMIT 6'''
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                print('No Books')
            else:
                return r
        except:
            print('Error getBooksByAuthor')

    def searchBooks(self, column, words):
        self.mysql.init_app(self.app)
        if column == "name":
            query = "SELECT * FROM `books` WHERE `bname` like '%" + words + "%'"
        elif column == "genre":
            query = ''' SELECT * FROM `books` WHERE `idgenre` like (SELECT `id` FROM `genres` WHERE `genre` like '%''' + words + "%')"
        elif column == "author":
            query = ''' SELECT * FROM `books` WHERE `idauthor` like (SELECT `id` FROM `author` WHERE `first` like '%''' + words + "%' OR `last` like '%" + words + "%')"
        elif column == "collection":
            query = ''' SELECT * FROM `books` WHERE `idcollect` like (SELECT `id` FROM `collections` WHERE `namecollection` like '%''' + words + "%')"
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                print('No Books')
            else:
                return r
        except:
            print('Error searchBooks')

    def searchReadLater(self, words, idUser):
        self.mysql.init_app(self.app)
        idUserStr = str(idUser)
        query = ''' SELECT * FROM `books` INNER JOIN `read_later` ON `books`.`id` = `read_later`.`idbook` WHERE `read_later`.`iduser` = ''' + idUserStr + ''' AND `books`.`bname` like''' + "'%" + words + "%'"
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                print('No Books')
            else:
                return r
        except:
            print('Error searchBooks')

    def searchPendings(self, words, idUser):
        self.mysql.init_app(self.app)
        idUserStr = str(idUser)
        query = ''' SELECT * FROM `books` INNER JOIN `readings` ON `books`.`id` = `readings`.`idbook` WHERE `readings`.`iduser` = ''' + idUserStr + ''' AND `books`.`bname` like''' + "'%" + words + "%'"
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                print('No Books')
            else:
                return r
        except:
            print('Error searchBooks')

    def checkReadLater(self, idUser, idBook):
        self.mysql.init_app(self.app)
        idBookStr = str(idBook)
        idUserStr = str(idUser)
        query = ''' SELECT * FROM `read_later` WHERE `iduser` like ''' + idUserStr + ''' AND `idbook` like ''' + idBookStr
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                return False
            else:
                return True
        except:
            print('Error checkReadLater')

    def addReadLater(self, idUser, idBook):
        self.mysql.init_app(self.app)
        idBookStr = str(idBook)
        idUserStr = str(idUser)
        query = ''' INSERT INTO `read_later`(`id`, `iduser`, `idbook`) VALUES (NULL, ''' + idUserStr + ", " + idBookStr + ")"
        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query)
            con.commit()
            return True
        except:
            return False

    def removeReadLater(self, idUser, idBook):
        self.mysql.init_app(self.app)
        idBookStr = str(idBook)
        idUserStr = str(idUser)
        query = ''' DELETE FROM `read_later` WHERE `iduser` like ''' + idUserStr + ''' AND `idbook` like ''' + idBookStr
        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query)
            con.commit()
            return True
        except:
            return False

    def getBfile(self, idbook):
        self.mysql.init_app(self.app)
        idBookStr = str(idbook)
        query = "SELECT `bfile` FROM `books` WHERE `id` =" + idBookStr
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                print('No Books')
            else:
                return r
        except:
            print('Error getSimilarBooksByBook')

    def getAlines(self, idbook, iduser):
        self.mysql.init_app(self.app)
        idBookStr = str(idbook)
        idUserStr = str(iduser)
        query = ''' SELECT `alines` FROM `readings` WHERE `iduser` = ''' + idUserStr + ''' AND `idbook` = ''' + idBookStr
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        try:
            r = [dict((cur.description[i][0], value) for i, value in enumerate(row))
                 for row in cur.fetchall()]
            if len(r) == 0:
                return self.insertAlines(idbook, iduser)
            else:
                self.updateDateLastReading(idbook, iduser)
                return r
        except:
            print('Error getAlines')

    def insertAlines(self, idbook, iduser):
        self.mysql.init_app(self.app)
        idBookStr = str(idbook)
        idUserStr = str(iduser)
        query = ''' INSERT INTO `readings`(`id`, `iduser`, `idbook`, `alines`, `lastreading`) VALUES (NULL''' + ", " + idUserStr + ", " + idBookStr + ", 0, NOW())"
        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query)
            con.commit()
            return self.getAlines(idbook, iduser)
        except:
            return False

    def updateDateLastReading(self, idbook, iduser):
        self.mysql.init_app(self.app)
        idBookStr = str(idbook)
        idUserStr = str(iduser)
        query = ''' UPDATE `readings` SET `lastreading`= NOW() WHERE `iduser` like ''' + idUserStr + ''' AND `idbook` like ''' + idBookStr
        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query)
            con.commit()
            return True
        except:
            return False

    def updateAlines(self, idbook, iduser, alines):
        self.mysql.init_app(self.app)
        idBookStr = str(idbook)
        idUserStr = str(iduser)
        alinesStr = str(alines)
        query = ''' UPDATE `readings` SET `alines`=''' + alines + ''' WHERE `iduser` like ''' + idUserStr + ''' AND `idbook` like ''' + idBookStr
        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query)
            con.commit()
            return True
        except:
            return False
def processData(self, only_cached_files=False, max_capacity=None):
    '''
    Processes all posts and returns two dictionaries in a tuple. The first
    maps image name to hash, and the second maps image name to OCR results.
    The results will also be cached in memory within the class and will be
    used in other methods for checking reposts.

    Returns:
        A tuple of two dictionaries, the first containing image name to hash
        mappings and the second containing image name to OCR readings.
    '''
    if not only_cached_files:
        files = [f for f in listdir(self.img_dir)
                 if isfile(join(self.img_dir, f)) and not f.startswith('.')]
        files.sort()
        self.readProcessedDataFromCache()
    else:
        self.readProcessedDataFromCache()
        files = list(self.__imageToHash.keys())
        files.sort()
    if max_capacity is not None:
        files = files[:max_capacity]
    d = self.__imageToHash
    t = self.__imageToText
    self.vPrint("loading... " + str(len(files)) + ' items')
    for i, file in enumerate(files):
        if len(files) < 50 or i % (len(files) // 20) == 0:
            self.vPrint('partial: %5d/%d' % (i, len(files)))
        try:
            if file not in d or file not in t:
                img = Image.open(join(self.img_dir, file))
                d[file] = Hasher.hashImage(img, self.__imagehash_method)
                t[file] = OCR.read2Normalized(img)
        except KeyboardInterrupt:
            self.vPrint('skipped remaining files')
            if file in d:
                del d[file]
            if file in t:
                del t[file]
            break
        except UnidentifiedImageError:
            self.vPrint('skipped ' + file + ' (not an image)')
            if file in d:
                del d[file]
            if file in t:
                del t[file]
    self.vPrint('loaded: ' + str(len(d.items())) + ' items')
    self.__imageToHash = d
    self.__imageToText = t
    self.saveProcessedDataToCache()
    return (d, t)
def __init__(self, size, keys):
    self._value = Bitset(size)
    self._size = size
    self._keys = keys
    self._hasher = Hasher()
from Crypto.Random import get_random_bytes  # assuming PyCryptodome as the source of get_random_bytes


def _derive_key(key, salt=None):
    # A default of get_random_bytes(32) in the signature would be evaluated only
    # once at definition time, reusing the same salt for every call; generate a
    # fresh salt per call instead when none is supplied.
    if salt is None:
        salt = get_random_bytes(32)
    h = Hasher(10)
    return h.hash(key, salt)[-32:], salt
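A brief usage sketch, under the assumption that Hasher(10).hash(key, salt) returns at least 32 bytes (implied by the [-32:] slice): the salt returned from the first call must be stored and passed back in to re-derive the same key later.

# Hypothetical round trip; names and values are illustrative only.
key_bytes, salt = _derive_key(b'correct horse battery staple')    # fresh random salt
same_key, _ = _derive_key(b'correct horse battery staple', salt)  # re-derive with the stored salt
assert key_bytes == same_key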
def setUp(self):
    self.hasher = Hasher("words.txt", nwords=3, delimeter="-")
class RequestURLCrawler:
    def __init__(self, init_dic):
        self.logger = getLogger()
        if not self.__isValidInfo(init_dic):
            self.logger.error("Failed to init RequestURLCrawler : Invalid input information")
            exit(1)
        self.info_dic = init_dic
        self.cursor = None
        # unvisited seeds (minimum heap ordered by page no.)
        # heappush(req_url_queue, (guid_hash, url_data))
        self.req_url_queue = []
        # visited + fully parsed data, dic[view_guid_hash] = URLData()
        self.url_data_dic = dict()
        self.hasher = Hasher()
        self.url_factory = None
        self.html_parser = None
        self.xml_producer = XMLPrinter(OUTPUT_PATH)

    def __isValidInfo(self, init_dic):
        """
        Check that all information required for crawling was provided.
        :param init_dic: crawl configuration
        :return: whether the input is valid
        """
        if "request_urls" in input_dic:
            if input_dic["request_urls"]:
                return True
        elif ("url_table" in input_dic) and ("db_info_key" in input_dic):
            return True
        return False

    def run(self, _cursor=None):
        """Entry point that starts the crawler."""
        self.cursor = _cursor
        if not self.cursor:
            self.cursor = getDBConnectionByName(self.info_dic["db_info_key"])
        self.logger.info("Start RequestURLCrawler crawler!")
        self.loadRequestURLs()  # load a batch of request URLs
        while self.req_url_queue:
            self.logger.info("Loaded [%s] view URLs", len(self.req_url_queue))
            crawl_count = self.startCrawl()  # consume the URLs in req_url_queue
            self.logger.info("Crawled [%s] view URLs" % crawl_count)
            save_count = self.saveURLData()  # save every crawled view URL and update its status
            self.logger.info("Saved [%s] view URLs", save_count)
            self.loadRequestURLs()
        self.logger.info("Finished total crawler!")
        if not _cursor:
            self.cursor.close()

    def loadRequestURLs(self, load_count=1000):
        """Load a batch of request URLs and fill the queue."""
        if "request_urls" in input_dic:
            count = 0
            while count <= load_count:
                req_url = input_dic["request_urls"].pop()
                url_info = self.url_factory.getGuid(req_url)  # url_info is a dict
                if url_info:
                    if url_info["url_type"] == "view":
                        guid_hash = self.hasher.md5(url_info["guid"])
                        url_data = URLData(guid_hash)
                        url_data.data_dic.update(url_info)
                        heappush(self.req_url_queue, (guid_hash, url_data))
                        count += 1
        else:
            query = ("SELECT url_md5, request_url FROM " + self.info_dic["url_table"]
                     + " WHERE visited = 'N' ORDER BY request_time LIMIT %s" % load_count)
            data_list = selectQuery(self.cursor, query, [self.info_dic["domain_id"], "N"])
            for no, video_url, insert_time in data_list:
                url_info = self.url_factory.getGuid(video_url)
                if url_info:
                    if url_info["url_type"] == "view":
                        guid_hash = self.hasher.md5(url_info["guid"])
                        url_data = URLData(guid_hash)
                        url_data.data_dic.update(url_info)
                        heappush(self.req_url_queue, (guid_hash, url_data))

    def startCrawl(self):
        """Consume the queued URLs one by one and extract the final parsed data."""
        count = 0
        while self.req_url_queue:
            guid_hash, url_data = heappop(self.req_url_queue)
            self.visitURL(url_data)
            self.url_data_dic[guid_hash] = url_data
            count += 1
            time.sleep(CRAWL_DELAY)
        return count

    def visitURL(self, url_data):
        """Visit the URL, parse it, and build the URL data."""
        down_url = url_data["url_info"]["down_url"]
        down_data = downloadPage(down_url)
        if down_data:
            http_header, http_content, real_URL = down_data
            parse_result = self.html_parser.parse(http_header, http_content, real_URL)
            crawl_data_count = len(parse_result)
            if parse_result:
                url_data.data_dic.update(parse_result)
                self.logger.info(" Crawled URL [%s] data from URL [%s]" % (crawl_data_count, down_url))

    def saveURLData(self):
        """Write the extracted view URLs to the DB and to an XML file."""
        # 1. Update flag in the load table
        update_query = "UPDATE " + self.info_dic["url_table"] + " SET visited = 'Y' WHERE url_md5 = %s"
        save_count = 0
        document_dic_list = []
        for guid_hash, url_data in self.url_data_dic.items():
            try:
                ret = executeQuery(self.cursor, update_query, [url_data.id])
                document_dic_list.append(url_data.data_dic)
                save_count += 1
                self.logger.info(" Updated URL %s : %s" % (ret, url_data.get("guid")))
            except Exception as msg:
                self.logger.error(" Update Failed : %s : %s", url_data.get("guid"), msg)
        # 2. Save data into XML file
        self.xml_producer.printXML(document_dic_list)
        return save_count
def checkRepostDetection(self, img: str, img_sim_min: float = 0.8, text_sim_min: float = 0.7,
                         recheck_img: bool = True, generate_repost: bool = False,
                         save_generated_repost: bool = True):
    '''
    Checks whether reposts can be detected correctly using a naive algorithm
    considering image hashes and OCR text. This assumes the dataset is
    correctly labelled such that a reposted image is the image name prefixed
    with _REPOST_. If an image is custom crafted and you don't want it to make
    a deduction of whether it's a true positive or otherwise, simply avoid
    using the standard format name of: <subreddit>_<postID>.<imgExtension>
    '''
    distances = []
    name_dist_dict = {}
    d = self.__imageToHash
    t = self.__imageToText
    target_check = img
    target_path = join(self.img_dir, target_check)
    target_img = None
    self.vPrint('we\'ll process post : ' + target_check)
    if generate_repost or recheck_img:
        target_img = Image.open(target_path)
    if target_img and (recheck_img or target_check not in d or target_check not in t):
        self.vPrint('computing target metadata')
        target_hash = Hasher.hashImage(target_img, self.__imagehash_method)
        target_text = OCR.read2Normalized(target_img)
        target_texthash = Hasher.hashText(target_text)
        d[target_check] = target_hash
        t[target_check] = target_text
        self.__imageToHash = d
        self.__imageToText = t
    else:
        target_hash = d[target_check]
        target_text = t[target_check]
    bad_check = '_REPOST_' + target_check
    if generate_repost:
        self.vPrint('generating dummy repost : _REPOST_' + target_check)
        bad_img = generate_bad_repost(target_path)
        bad_img_path = join(self.img_dir, bad_check)
        self.vPrint('computing target metadata')
        bad_img_hash = Hasher.hashImage(bad_img, self.__imagehash_method)
        bad_img_text = OCR.read2Normalized(bad_img)
        bad_img_texthash = Hasher.hashText(bad_img_text)
        d[bad_check] = bad_img_hash
        t[bad_check] = bad_img_text
        if save_generated_repost:
            bad_img.save(bad_img_path)
        self.__imageToHash = d
        self.__imageToText = t
        if self.update_cache:
            self.saveProcessedDataToCache()
    self.vPrint('\nchecking...')
    for key, value in d.items():
        if key == target_check:
            continue
        img_diff = Hasher.diff(value, target_hash, 'IMAGE')
        text_sim = 0.0 if text_sim_min <= 0.0 else Levenshtein.ratio(t[key], target_text)
        distances.append((key, img_diff, text_sim))
        name_dist_dict[key] = (distances[-1][1], distances[-1][2])

    def orderOfSort(x):
        '''Dynamic sorting to prioritise text if image and text are both really close.'''
        img_diff = x[1]
        txt_diff = 1 - x[2]
        if txt_diff <= 1 - text_sim_min and img_diff <= 1 - img_sim_min:
            return (txt_diff - 1, img_diff - 1)
        return (img_diff, txt_diff)

    distances.sort(key=orderOfSort)
    counter = 0
    results = {}
    FP = 0
    FN = 0
    self.vPrint('--- similar results ---')
    self.vPrint(' SAME? | IMG_SIM | TEXT_SIM | IMAGE')
    for a, b, c in distances:
        standardFormat = (len(a.split('.')) == 2
                          and len(a.split('.')[0].split('_REPOST_')[-1].split('_')) == 2)
        is_known_same = a.split('_REPOST_')[-1] == target_check.split('_REPOST_')[-1]
        is_repost = b <= 1 - img_sim_min and c >= text_sim_min
        if not standardFormat:
            validity = '??'
        else:
            if is_known_same:
                if is_repost:
                    validity = 'TP'
                else:
                    validity = 'FN'
                    FN += 1
            else:
                if is_repost:
                    validity = 'FP'
                    FP += 1
                else:
                    validity = 'TN'
        if counter < 10:
            counter += 1
            if self.verbose:
                self.vPrint('%8s %7.3f %8.3f %-50s' %
                            (('YES, ' if is_repost else ' NO, ') + validity, 1 - b, c, a))
                if standardFormat:
                    subreddit = a.split('_REPOST_')[-1].split('_')[0]
                    post_id = a.split('_REPOST_')[-1].split('_')[-1].split('.')[0]
                    self.vPrint('reddit.com/r/' + subreddit + '/comments/' + post_id + '/')
                else:
                    self.vPrint('• this image isn\'t from the standard dataset')
                if a == target_check:
                    self.vPrint('• this is the originally chosen image')
                elif is_known_same:
                    self.vPrint('• this is known to be the same as the chosen image')
                self.vPrint()
        results[a] = {
            'imgName': a,
            'isRepost': is_repost,
            'validity': validity,
            'imgDiff': b,
            'textSim': c
        }
    if FP or FN:
        self.vPrint('important notes:')
        self.vPrint('we have %d known false positives and %d known false negatives for this\n'
                    % (FP, FN))
    return results
def __init_hasher__():
    global __HASHER_START_X__
    global __HASHER_START_Y__
    hasher = Hasher(__HASHER_START_X__, __HASHER_START_Y__)
    add_hasher(hasher)
def main():
    hasher = Hasher(args.wordfile, nwords=3, delimeter="-")
    pprint(hasher.process(args.input))