def upload_text():
    """Insert a text document into the docs collection.

    Expects JSON with a 'text' field and an optional 'doc_id'; generates a
    hex uuid when no id is supplied. Returns a JSON payload with 'failed'
    (0/1) and, on success, the 'doc_id' of the inserted document.
    """
    payload = request.get_json(force=True)
    text = payload.get('text', None)
    doc_id = payload.get('doc_id', None)

    # Reject requests that carry no text at all.
    if text is None:
        return jsonify({'failed': 1, 'error': 'No text field in json'})

    now = datetime.datetime.now()
    if doc_id is None:
        doc_id = uuid.uuid4().hex

    lang = detect_lang(text)
    embedding = doc2vec(text)

    doc = {
        "doc_id": doc_id,
        "has_image": False,
        "has_text": True,
        "date_added": now,
        "date_updated": now,
        "tags": [],
        "text": text,
        "lang": lang,
    }
    # Only store an embedding when one could actually be computed.
    if embedding is not None:
        doc["vec"] = embedding

    db.docs.insert_one(doc)
    return jsonify({'failed': 0, 'doc_id': doc_id})
def upload_image():
    """Index an image (plus any text detected inside it) in the docs collection.

    Expects JSON with 'image_url' and optional 'doc_id' / 'source'. Extracts
    a ResNet18 image embedding, OCRs the image for text, embeds that text,
    stores the combined document, and pushes the vectors into the image/text/
    combined search indices. Returns JSON with 'failed' (0/1) and 'doc_id'.
    """
    data = request.get_json(force=True)
    image_url = data.get('image_url')
    doc_id = data.get('doc_id', None)
    source = data.get('source', 'tattle-admin')

    if image_url is None:
        ret = {'failed': 1, 'error': 'No image_url found'}
    else:
        image_dict = image_from_url(image_url)
        image = image_dict['image']
        image = image.convert('RGB')  # take care of png(RGBA) issue
        image_vec = resnet18.extract_feature(image)
        detected_text = detect_text(image_dict['image_bytes']).get('text', '')
        lang = detect_lang(detected_text)

        # BUG FIX: the original condition was `detected_text == '' or None`,
        # which parses as `(detected_text == '') or None` — the `or None` arm
        # is always falsy, so None was never caught. `not detected_text`
        # covers both '' and None as intended.
        if not detected_text:
            text_vec = np.zeros(300).tolist()
            has_text = False
        else:
            text_vec = doc2vec(detected_text)
            has_text = True

        # Fall back to a zero vector when language detection or embedding
        # failed. NOTE(review): has_text stays True on these paths in the
        # original code; preserved here — confirm that is intentional.
        if lang is None:
            text_vec = np.zeros(300).tolist()
            has_text = True
        if text_vec is None:
            text_vec = np.zeros(300).tolist()
            has_text = True

        vec = np.hstack((image_vec, text_vec)).tolist()
        date = datetime.datetime.now()
        if doc_id is None:
            doc_id = uuid.uuid4().int

        db.docs.insert_one({
            "doc_id": doc_id,
            "source": source,
            "version": "1.1",
            "has_image": True,
            "has_text": has_text,
            "text": detected_text,
            "tags": [],
            "date_added": date,
            "date_updated": date,
            "image_vec": image_vec.tolist(),
            "text_vec": text_vec,
            "vec": vec,
        })
        ret = {'doc_id': doc_id, 'failed': 0}

        # update the search index
        imagesearch.update(doc_id, image_vec)
        docsearch.update(doc_id, vec)
        if has_text:
            textsearch.update(doc_id, text_vec)
    return jsonify(ret)
def find_duplicate():
    """Search for the closest stored document to a query text or image_url.

    Prefers the image branch when 'image_url' is present; otherwise embeds
    the text and queries the text index (checking first for an exact text
    match in the DB). Responds with 'duplicate' 0/1 and, when found, the
    matching 'doc_id' (plus 'distance' for approximate matches).
    """
    payload = request.get_json(force=True)
    query_text = payload.get('text', None)
    thresh = payload.get('threshold')
    image_url = payload.get('image_url', None)

    if query_text is None and image_url is None:
        ret = {'failed': 1, 'error': 'No text or image_url found'}
    elif image_url is not None:
        image_dict = image_from_url(image_url)
        rgb_image = image_dict['image'].convert('RGB')  # take care of png(RGBA) issue
        query_vec = resnet18.extract_feature(rgb_image)
        if thresh:
            doc_id, dist = imagesearch.search(query_vec, thresh)
        else:
            doc_id, dist = imagesearch.search(query_vec)
        if doc_id is None:
            ret = {'failed': 0, 'duplicate': 0}
        else:
            ret = {
                'failed': 0,
                'duplicate': 1,
                'doc_id': doc_id,
                'distance': dist
            }
    elif query_text is not None:
        # An exact text match in the DB takes precedence over the ANN result.
        exact_match = db.docs.find_one({"text": query_text})
        query_vec = doc2vec(query_text)
        if thresh:
            doc_id, dist = textsearch.search(query_vec, thresh)
        else:
            doc_id, dist = textsearch.search(query_vec)
        if exact_match is not None:
            ret = {
                'failed': 0,
                'duplicate': 1,
                'doc_id': exact_match.get('doc_id')
            }
        elif doc_id is not None:
            ret = {
                'failed': 0,
                'duplicate': 1,
                'doc_id': doc_id,
                'distance': dist
            }
        else:
            ret = {'failed': 0, 'duplicate': 0}
    else:
        ret = {'failed': 1, 'error': 'something went wrong'}
    return jsonify(ret)
def s3ToDB(objs, url_prefix, img_model, docs):
    """Fetch each object from an S3 listing and index it into `docs`.

    Args:
        objs: an S3 list_objects-style response dict with a 'Contents' list.
        url_prefix: base URL prepended to each object key.
        img_model: model exposing extract_feature(image) for image embeddings.
        docs: collection with insert_one, receiving default_db_doc records.

    Image objects get an image embedding; text objects get a doc2vec text
    embedding. Objects that fail to process are skipped with a printed error.
    """
    from analyzer import image_from_url, doc2vec
    for f in objs['Contents']:
        url = url_prefix + f['Key']
        # PERF FIX: the original issued two GETs per text object — one for
        # the Content-Type header and a second for the body. A single
        # response serves both.
        response = requests.get(url)
        content_type = response.headers['Content-Type']
        print(f['Key'], content_type)
        # better check for content-type
        if content_type.startswith('image'):
            try:
                # fails with pngs
                img = image_from_url(url)
                img_bytes = img['image']
                image_vec = img_model.extract_feature(img_bytes)
                doc = default_db_doc(
                    has_image=True, image_vec=image_vec.tolist())
                docs.insert_one(doc)
            except Exception as e:
                print('error', e)
                continue
            print('added image: ', doc['doc_id'])
        elif content_type.startswith('text'):
            text = response.text
            # Skip empty bodies — nothing to embed.
            if len(text) == 0:
                continue
            textvec, lang = doc2vec(text)
            doc = default_db_doc(has_text=True,
                                 text=text,
                                 lang=lang,
                                 text_vec=textvec.tolist())
            docs.insert_one(doc)
            print('added text: ', doc['doc_id'])
def build(self):
    """Load (doc_id, vector) pairs into self.ids / self.vecs.

    The source depends on self.db_type: 'mongo' reads text docs from the
    docs collection, 'sqlite' reads from the documents table at
    self.db_filename, and 'testing' treats self.db_filename as an in-memory
    list of (doc_id, text) tuples to embed on the fly. Finishes by
    converting self.vecs to a numpy array.
    """
    if self.db_type == 'mongo':
        db = mongoDB()
        docs = db.docs
        query = {"has_text": True}
        total_docs = docs.count_documents(query)
        for doc in tqdm(docs.find(query), total=total_docs):
            text_vec = doc.get('text_vec')
            # Skip docs that never got a text embedding.
            if text_vec is None:
                continue
            self.ids.append(doc.get('doc_id'))
            self.vecs.append(text_vec)
    elif self.db_type == 'sqlite':
        db = sqlDatabase(self.db_filename)
        # TODO: a nicer way to get count
        total_docs = db.query("SELECT COUNT(doc_id) from documents")[0][0]
        rows = db.query(
            "SELECT doc_id, vec from documents where vec != 'null'")
        for row in tqdm(rows, total=total_docs):
            self.ids.append(row[0])
            self.vecs.append(row[1])
    elif self.db_type == 'testing':
        # Local testing dataset: db_filename is [(doc_id, doc_text), ...]
        for doc_id, doc in self.db_filename:
            vec, lang = doc2vec(doc)
            if lang is None:
                # bad example doc
                print(f'{vec}: doc no. {doc_id}')
                continue
            # insert template doc into search set
            self.ids.append(doc_id)
            self.vecs.append(vec.tolist())
    self.vecs = np.array(self.vecs)
def create_sql_db(filename='docs_sqlite_db.db'):
    """Create a sqlite documents DB and seed it from local test fixtures.

    Reads every file in tests/texts/ (embedded with doc2vec) and
    tests/images/ (embedded with ResNet18) and inserts one row per file.

    Args:
        filename: path of the sqlite database file to create.
    """
    from analyzer import doc2vec, ResNet18
    from PIL import Image
    db = sqlDatabase(filename)
    # https://www.tutorialspoint.com/sqlite/sqlite_data_types.htm
    insert_table_query = 'CREATE TABLE documents (doc_id integer primary key, has_image int, has_text int, date_added text, date_updated text, tags text, textdata text, lang text, vec array, imagedata blob, imagemetadata text, imagevec array)'
    db.execute(insert_table_query)
    # Single shared INSERT statement; the value tuples below must follow
    # this exact column order.
    insert_row_query = 'INSERT into documents(has_image, has_text, date_added, date_updated, tags, textdata, lang, vec, imagedata, imagemetadata, imagevec) values(?,?,?,?,?,?,?,?,?,?,?)'

    texts_folder = listdir('tests/texts/')
    images_folder = listdir('tests/images/')

    # defaults for text rows
    has_text = 1
    has_image = 0
    date_added = date.today()
    date_updated = date.today()
    tags = None
    imagedata = None
    imagemetadata = None
    imagevec = None
    for file in texts_folder:
        with open('tests/texts/' + file, 'r') as f:
            textdata = f.read()
        if len(textdata) == 0:
            continue
        vec, lang = doc2vec(textdata, 'word2vec/word2vec.db')
        # BUG FIX: the original tuple began (has_text, has_image, ...) while
        # the INSERT column list begins (has_image, has_text, ...), silently
        # swapping the two flags in every row. Values now match column order.
        data = (has_image, has_text, date_added, date_updated, tags,
                textdata, lang, vec, imagedata, imagemetadata, imagevec)
        db.execute(insert_row_query, data)

    # defaults for image rows
    has_text = 0
    has_image = 1
    date_added = date.today()
    date_updated = date.today()
    tags = None
    textdata = None
    lang = None
    vec = None
    model = ResNet18()
    for file in images_folder:
        img = Image.open('tests/images/' + file)
        imagedata = img.tobytes()
        imagemetadata = str({'mode': img.mode, 'size': img.size})
        imagevec = model.extract_feature(img)
        # Same fix as above: has_image precedes has_text.
        data = (has_image, has_text, date_added, date_updated, tags,
                textdata, lang, vec, imagedata, imagemetadata, imagevec)
        db.execute(insert_row_query, data)
    db.commit()
    db._conn.close()
def find_duplicate():
    """Return up to 10 nearest stored documents for a query text or image_url.

    Image queries go through the image index; text queries check for an
    exact text match first, then the text index. Each result entry carries
    'doc_id', 'dist', and the document's 'source'. Responds with
    {'failed': 0, 'result': [...]} on success or {'failed': 1, 'error': ...}.
    """
    data = request.get_json(force=True)
    text = data.get('text', None)
    thresh = data.get('threshold')
    # NOTE(review): this request field is immediately shadowed by the lookup
    # dicts below and never used — confirm whether source filtering was
    # intended here.
    sources = data.get('sources', [])
    image_url = data.get('image_url', None)

    if text is None and image_url is None:
        ret = {'failed': 1, 'error': 'No text or image_url found'}
    elif image_url is not None:
        image_dict = image_from_url(image_url)
        image = image_dict['image']
        image = image.convert('RGB')  # take care of png(RGBA) issue
        vec = resnet18.extract_feature(image)
        if thresh:
            doc_ids, dists = imagesearch.search(vec, thresh)
        else:
            doc_ids, dists = imagesearch.search(vec)
        sources = {
            d.get('doc_id'): d.get('source')
            for d in db.docs.find({"doc_id": {
                "$in": doc_ids
            }})
        }
        if doc_ids is not None:
            result = [{
                'doc_id': doc_ids[i],
                'dist': dists[i],
                'source': sources[doc_ids[i]]
            } for i in range(min(10, len(doc_ids)))]
            ret = {'failed': 0, 'result': result}
        else:
            ret = {'failed': 0, 'result': []}
    elif text is not None:
        duplicate_doc = db.docs.find_one({"text": text})
        vec = doc2vec(text)
        if vec is None:
            # BUG FIX: the original assigned this error response but fell
            # through and still called textsearch.search(None) (and queried
            # mongo with it), crashing or overwriting the error. Return
            # immediately instead.
            return jsonify({'failed': 1, 'error': 'query words not found in db'})
        doc_ids, dists = textsearch.search(vec)
        sources = {
            d.get('doc_id'): d.get('source')
            for d in db.docs.find({"doc_id": {
                "$in": doc_ids
            }})
        }
        if doc_ids is not None:
            result = [{
                'doc_id': doc_ids[i],
                'dist': dists[i],
                'source': sources[doc_ids[i]]
            } for i in range(min(10, len(doc_ids)))]
        else:
            result = []
        # An exact text match is prepended with distance 0.
        if duplicate_doc is not None:
            result = [{
                'doc_id': duplicate_doc.get('doc_id'),
                'dist': 0.0,
                'source': duplicate_doc.get('source')
            }] + result
        ret = {'failed': 0, 'duplicate': 1, 'result': result}
    else:
        ret = {'failed': 1, 'error': 'something went wrong'}
    return jsonify(ret)