Example #1
def upload_text():
    data = request.get_json(force=True)
    text = data.get('text', None)
    doc_id = data.get('doc_id', None)
    if text is None:
        ret = {'failed': 1, 'error': 'No text field in json'}
        return jsonify(ret)

    date = datetime.datetime.now()
    if doc_id is None:
        doc_id = uuid.uuid4().hex

    lang = detect_lang(text)
    vec = doc2vec(text)
    doc = {
        "doc_id": doc_id,
        "has_image": False,
        "has_text": True,
        "date_added": date,
        "date_updated": date,
        "tags": [],
        "text": text,
        "lang": lang,
    }
    if vec is not None:
        doc["vec"] = vec

    db.docs.insert_one(doc)
    ret = {'failed': 0, 'doc_id': doc_id}
    return jsonify(ret)
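
A minimal client-side sketch of calling this handler, assuming it is exposed as a Flask JSON endpoint; the URL path below is a guess, since the snippet does not show its route decorator.

import requests

# Hypothetical endpoint URL; adjust to wherever upload_text is actually routed.
resp = requests.post(
    'http://localhost:5000/upload_text',
    json={'text': 'some message text'},  # 'doc_id' is optional; the server generates one if omitted
)
print(resp.json())  # {'failed': 0, 'doc_id': '...'} on success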
Example #2
def upload_image():
    data = request.get_json(force=True)
    image_url = data.get('image_url')
    doc_id = data.get('doc_id', None)
    source = data.get('source', 'tattle-admin')
    if image_url is None:
        ret = {'failed': 1, 'error': 'No image_url found'}
    else:
        image_dict = image_from_url(image_url)
        image = image_dict['image']
        image = image.convert('RGB')  #take care of png(RGBA) issue
        image_vec = resnet18.extract_feature(image)

        detected_text = detect_text(image_dict['image_bytes']).get('text', '')
        lang = detect_lang(detected_text)

        if not detected_text:
            # no text detected in the image: fall back to a zero text vector
            has_text = False
            text_vec = np.zeros(300).tolist()
        else:
            has_text = True
            text_vec = doc2vec(detected_text)
            if lang is None or text_vec is None:
                # text found but could not be vectorized: fall back to zeros
                text_vec = np.zeros(300).tolist()

        vec = np.hstack((image_vec, text_vec)).tolist()

        date = datetime.datetime.now()
        if doc_id is None:
            doc_id = uuid.uuid4().int
        db.docs.insert_one({
            "doc_id": doc_id,
            "source": source,
            "version": "1.1",
            "has_image": True,
            "has_text": has_text,
            "text": detected_text,
            "tags": [],
            "date_added": date,
            "date_updated": date,
            "image_vec": image_vec.tolist(),
            "text_vec": text_vec,
            "vec": vec,
        })
        ret = {'doc_id': doc_id, 'failed': 0}

        #update the search index
        imagesearch.update(doc_id, image_vec)
        docsearch.update(doc_id, vec)
        if has_text:
            textsearch.update(doc_id, text_vec)

    return jsonify(ret)
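
The handler above assumes image_from_url returns a dict exposing both a decoded PIL image ('image') and the raw downloaded bytes ('image_bytes'). A minimal sketch consistent with that usage, not the project's actual helper, might look like this.

import io
import requests
from PIL import Image

def image_from_url_sketch(image_url):
    # Hypothetical stand-in for image_from_url: download the file once and
    # return both the decoded PIL image and the raw bytes used by detect_text.
    resp = requests.get(image_url)
    image_bytes = resp.content
    image = Image.open(io.BytesIO(image_bytes))
    return {'image': image, 'image_bytes': image_bytes}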
Example #3
def find_duplicate():
    data = request.get_json(force=True)
    text = data.get('text', None)
    thresh = data.get('threshold')
    image_url = data.get('image_url', None)
    if text is None and image_url is None:
        ret = {'failed': 1, 'error': 'No text or image_url found'}

    elif image_url is not None:
        image_dict = image_from_url(image_url)
        image = image_dict['image']
        image = image.convert('RGB')  #take care of png(RGBA) issue
        vec = resnet18.extract_feature(image)
        if thresh:
            doc_id, dist = imagesearch.search(vec, thresh)
        else:
            doc_id, dist = imagesearch.search(vec)

        if doc_id is not None:
            ret = {
                'failed': 0,
                'duplicate': 1,
                'doc_id': doc_id,
                'distance': dist
            }
        else:
            ret = {'failed': 0, 'duplicate': 0}

    elif text is not None:
        duplicate_doc = db.docs.find_one({"text": text})
        vec = doc2vec(text)
        if thresh:
            doc_id, dist = textsearch.search(vec, thresh)
        else:
            doc_id, dist = textsearch.search(vec)
        if duplicate_doc is not None:
            ret = {
                'failed': 0,
                'duplicate': 1,
                'doc_id': duplicate_doc.get('doc_id')
            }
        elif doc_id is not None:
            ret = {
                'failed': 0,
                'duplicate': 1,
                'doc_id': doc_id,
                'distance': dist
            }
        else:
            ret = {'failed': 0, 'duplicate': 0}

    else:
        ret = {'failed': 1, 'error': 'something went wrong'}

    return jsonify(ret)
Example #4
def s3ToDB(objs, url_prefix, img_model, docs):
    from analyzer import image_from_url, doc2vec

    for f in objs['Contents']:
        url = url_prefix + f['Key']
        # urls += [url]

        content_type = requests.get(url).headers['Content-Type']
        print(f['Key'], content_type)
        # better check for content-type
        if content_type[:5] == 'image':
            try:
                # fails with pngs
                img = image_from_url(url)
                image = img['image']  # PIL image, not raw bytes
                image_vec = img_model.extract_feature(image)

                doc = default_db_doc(
                    has_image=True, image_vec=image_vec.tolist())
                docs.insert_one(doc)
            except Exception as e:
                print('error', e)
                continue

            print('added image: ', doc['doc_id'])

        elif content_type[:4] == 'text':
            text = requests.get(url).text
            if len(text) == 0:
                continue

            textvec, lang = doc2vec(text)

            doc = default_db_doc(has_text=True, text=text,
                                 lang=lang, text_vec=textvec.tolist())
            docs.insert_one(doc)

            print('added text: ', doc['doc_id'])
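
s3ToDB delegates the boilerplate fields to default_db_doc. A plausible sketch of that helper, inferred from the document shape built by hand in Example #1 (hypothetical, since the real implementation is not shown):

import datetime
import uuid

def default_db_doc_sketch(**fields):
    # Hypothetical stand-in for default_db_doc: start from the common fields
    # seen in Example #1 and overlay whatever the caller passes in.
    now = datetime.datetime.now()
    doc = {
        'doc_id': uuid.uuid4().hex,
        'has_image': False,
        'has_text': False,
        'date_added': now,
        'date_updated': now,
        'tags': [],
    }
    doc.update(fields)
    return doc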
Example #5
    def build(self):
        if self.db_type == 'mongo':
            db = mongoDB()
            docs = db.docs

            cur = docs.find({"has_text": True})
            total_docs = docs.count_documents({"has_text": True})
            for doc in tqdm(cur, total=total_docs):
                if doc.get('text_vec') is None:
                    continue
                self.ids.append(doc.get('doc_id'))
                self.vecs.append(doc.get('text_vec'))
        elif self.db_type == 'sqlite':
            db = sqlDatabase(self.db_filename)
            # TODO: a nicer way to get count
            total_docs = db.query("SELECT COUNT(doc_id) from documents")[0][0]
            cur = db.query(
                "SELECT doc_id, vec from documents where vec != 'null'")
            for doc in tqdm(cur, total=total_docs):
                self.ids.append(doc[0])
                self.vecs.append(doc[1])
        elif self.db_type == 'testing':
            """setup up local testing dataset

            db_filename {list(tuple)}: [(doc_id, vec),...]
            """
            for i, doc in self.db_filename:
                vec, lang = doc2vec(doc)
                if lang is None:
                    # bad example doc
                    print(f'{vec}: doc no. {i}')
                    continue

                # insert template doc into search set
                self.ids.append(i)
                self.vecs.append(vec.tolist())

        self.vecs = np.array(self.vecs)
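
build() only populates self.ids and self.vecs; the search(vec) and search(vec, thresh) calls in the other examples imply a nearest-neighbour lookup over that matrix. A minimal brute-force L2 sketch of such a method, matching the single-result contract used in Example #3 (an assumption, not the project's implementation):

    def search(self, query_vec, thresh=None):
        # Hypothetical companion to build(): brute-force nearest neighbour
        # over self.vecs, returning (doc_id, distance), or (None, None) when
        # the index is empty or the best match exceeds the optional threshold.
        if len(self.vecs) == 0:
            return None, None
        dists = np.linalg.norm(self.vecs - np.asarray(query_vec), axis=1)
        best = int(np.argmin(dists))
        if thresh is not None and dists[best] > thresh:
            return None, None
        return self.ids[best], float(dists[best])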
Example #6
def create_sql_db(filename='docs_sqlite_db.db'):
    from analyzer import doc2vec, ResNet18
    from PIL import Image

    db = sqlDatabase(filename)

    # https://www.tutorialspoint.com/sqlite/sqlite_data_types.htm
    create_table_query = (
        'CREATE TABLE documents ('
        'doc_id integer primary key, has_image int, has_text int, '
        'date_added text, date_updated text, tags text, textdata text, '
        'lang text, vec array, imagedata blob, imagemetadata text, '
        'imagevec array)'
    )
    db.execute(create_table_query)

    texts_folder = listdir('tests/texts/')
    images_folder = listdir('tests/images/')

    # defaults
    has_text = 1
    has_image = 0
    date_added = date.today()
    date_updated = date.today()
    tags = None
    imagedata = None
    imagemetadata = None
    imagevec = None

    for file in texts_folder:
        with open('tests/texts/' + file, 'r') as f:
            textdata = f.read()
            if len(textdata) == 0:
                continue
            vec, lang = doc2vec(textdata, 'word2vec/word2vec.db')

            data = (has_image, has_text, date_added, date_updated,
                    tags, textdata, lang, vec, imagedata, imagemetadata, imagevec)

        db.execute('INSERT into documents(has_image, has_text, date_added, date_updated, tags, textdata, lang, vec, imagedata, imagemetadata, imagevec) values(?,?,?,?,?,?,?,?,?,?,?)', data)

    # defaults
    has_text = 0
    has_image = 1
    date_added = date.today()
    date_updated = date.today()
    tags = None
    textdata = None
    lang = None
    vec = None

    model = ResNet18()

    for file in images_folder:
        img = Image.open('tests/images/' + file)
        # assert(type(img) == Image.Image)
        imagedata = img.tobytes()
        imagemetadata = str({'mode': img.mode, 'size': img.size})
        imagevec = model.extract_feature(img)

        data = (has_image, has_text, date_added, date_updated,
                tags, textdata, lang, vec, imagedata, imagemetadata, imagevec)

        db.execute('INSERT into documents(has_image, has_text, date_added, date_updated, tags, textdata, lang, vec, imagedata, imagemetadata, imagevec) values(?,?,?,?,?,?,?,?,?,?,?)', data)

    db.commit()
    db._conn.close()
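
The CREATE TABLE statement declares vec and imagevec as array columns, which is not a built-in SQLite type, so sqlDatabase presumably registers numpy adapters. A common pattern for that (an assumption about how this project wires it up):

import io
import sqlite3
import numpy as np

def adapt_array(arr):
    # Serialize a numpy array to bytes so it can be stored in an 'array' column.
    buf = io.BytesIO()
    np.save(buf, arr)
    return sqlite3.Binary(buf.getvalue())

def convert_array(blob):
    # Read the bytes written by adapt_array back into a numpy array.
    return np.load(io.BytesIO(blob))

sqlite3.register_adapter(np.ndarray, adapt_array)
sqlite3.register_converter('array', convert_array)

# Connections must opt in to type converters for declared column types.
conn = sqlite3.connect('docs_sqlite_db.db', detect_types=sqlite3.PARSE_DECLTYPES)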
Example #7
def find_duplicate():
    data = request.get_json(force=True)
    text = data.get('text', None)
    thresh = data.get('threshold')
    sources = data.get('sources', [])
    image_url = data.get('image_url', None)
    if text is None and image_url is None:
        ret = {'failed': 1, 'error': 'No text or image_url found'}

    elif image_url is not None:
        image_dict = image_from_url(image_url)
        image = image_dict['image']
        image = image.convert('RGB')  #take care of png(RGBA) issue
        vec = resnet18.extract_feature(image)
        if thresh:
            doc_ids, dists = imagesearch.search(vec, thresh)
        else:
            doc_ids, dists = imagesearch.search(vec)
        sources = {
            d.get('doc_id'): d.get('source')
            for d in db.docs.find({"doc_id": {
                "$in": doc_ids
            }})
        }

        if doc_ids is not None:
            result = [{
                'doc_id': doc_ids[i],
                'dist': dists[i],
                'source': sources[doc_ids[i]]
            } for i in range(min(10, len(doc_ids)))]
            ret = {'failed': 0, 'result': result}
        else:
            ret = {'failed': 0, 'result': []}

    elif text is not None:
        duplicate_doc = db.docs.find_one({"text": text})
        vec = doc2vec(text)
        if vec is None:
            ret = {'failed': 1, 'error': 'query words not found in db'}
            return jsonify(ret)
        doc_ids, dists = textsearch.search(vec)
        sources = {
            d.get('doc_id'): d.get('source')
            for d in db.docs.find({"doc_id": {
                "$in": doc_ids
            }})
        }

        if doc_ids is not None:
            result = [{
                'doc_id': doc_ids[i],
                'dist': dists[i],
                'source': sources[doc_ids[i]]
            } for i in range(min(10, len(doc_ids)))]
        else:
            result = []

        if duplicate_doc is not None:
            result = [{
                'doc_id': duplicate_doc.get('doc_id'),
                'dist': 0.0,
                'source': duplicate_doc.get('source')
            }] + result

        ret = {'failed': 0, 'duplicate': 1, 'result': result}

    else:
        ret = {'failed': 1, 'error': 'something went wrong'}

    return jsonify(ret)
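
A client-side sketch of querying this handler for image duplicates; the URL path and the threshold value are assumptions, since the route decorator and the distance scale are not shown in the snippet.

import requests

resp = requests.post(
    'http://localhost:5000/find_duplicate',
    json={'image_url': 'https://example.com/sample.jpg', 'threshold': 10},
)
print(resp.json())  # {'failed': 0, 'result': [{'doc_id': ..., 'dist': ..., 'source': ...}, ...]}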