Пример #1
0
def insert_html_images(d):
    """Render a stored HTML document to an image and record it on the doc.

    Fetches the raw HTML for the document from GridFS, renders it with
    imgkit inside a virtual display, stores the rendered image, and
    appends the stored image reference to the document's 'page_images' list.

    Inputs:
        d: mapping containing the document's '_id' (e.g. a record returned
           by a pymongo find).
    Output:
        The result of pu.update_doc (described by the original author as
        a boolean success indicator).
    """
    client = py.MongoClient('mongo')
    db = client['docs']
    col = db['aug_meta']
    doc_id = d['_id']
    doc = col.find_one({"_id": doc_id})
    html_file = pu.get_from_gridfs(db, doc['raw_file'])

    display = Display(visible=0, size=(800, 600))
    display.start()
    try:
        # The context manager closes and removes the temporary file even if
        # rendering fails; the original `f.delete` was a no-op attribute
        # access, so every call left a stray .html file behind.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.html') as f:
            f.write(html_file)
            # Flush so the renderer sees the full content when it opens
            # the file by name.
            f.flush()
            imgkit.from_file(f.name, 'pageimg.jpg')
    finally:
        # Always shut the virtual display down so display processes do not
        # accumulate across calls.
        display.stop()

    # NOTE(review): the original called `py.import_to_gridfs`, but `py` is
    # the alias used for pymongo here (py.MongoClient), which provides no
    # such function. Assuming the helper lives in `pu` next to
    # get_from_gridfs -- confirm.
    image_ref = pu.import_to_gridfs(db, 'pageimg.jpg', 'image')
    doc.setdefault('page_images', []).append(image_ref)

    return pu.update_doc(col, doc_id, doc)
Пример #2
0
def insert_content_type(d):
    """Store a standardized content type at the top level of a document.

    Reads the document's metadata 'Content-Type' value, passes it through
    standardize_content_type, and writes the result under the top-level
    'Content-Type' key: the first element as 'Content' and, when exactly
    two elements are returned, the second as 'Charset'.

    Inputs:
        d: mapping containing the document's '_id' (e.g. a record returned
           by a pymongo find).
    Output:
        The result of pu.update_doc (described by the original author as
        a boolean success indicator).
    """
    collection = py.MongoClient('mongo')['docs']['aug_meta']

    key = d['_id']
    record = collection.find_one({"_id": key})

    parsed = standardize_content_type(record['metadata']['Content-Type'])

    # Build the entry first, then attach it to the record in one step.
    entry = {'Content': parsed[0]}
    if len(parsed) == 2:
        entry['Charset'] = parsed[1]
    record['Content-Type'] = entry

    return pu.update_doc(collection, key, record)
Пример #3
0
def insert_glove(d):
    """Attach a glove-based document vector to a document.

    Generates a vector from the document's 'content' text via
    generate_glove and stores it, converted to a list, under
    doc['ml-features']['glove'].

    Relies on the spacy model already being available in this module's
    namespace as "model", so that it is loaded only once per container.

    Inputs:
        d: mapping containing the document's '_id' (e.g. a record returned
           by a pymongo find).
    Output:
        The result of pu.update_doc (described by the original author as
        a boolean success indicator).
    """
    collection = py.MongoClient('mongo')['docs']['aug_meta']

    key = d['_id']
    record = collection.find_one({"_id": key})

    # 300 is passed straight through to generate_glove; presumably the
    # vector dimensionality -- confirm against that helper.
    vector = generate_glove(record['content'], model, 300)

    # Create the feature container on first use, then add this feature.
    features = record.setdefault('ml-features', {})
    features['glove'] = vector.tolist()

    return pu.update_doc(collection, key, record)
Пример #4
0
def insert_doc2vec(d):
    """Attach a word2vec-based document vector to a document.

    Generates a vector from the document's 'content' text via
    generate_doc2vec and stores it, converted to a list, under
    doc['ml-features']['doc2vec'].

    The word2vec model must already be loaded as "model" in this module's
    namespace, so that it is not reloaded on every call.

    Inputs:
        d: mapping containing the document's '_id' (e.g. a record returned
           by a pymongo find).
    Output:
        The result of pu.update_doc (described by the original author as
        a boolean success indicator).
    """
    collection = py.MongoClient('mongo')['docs']['aug_meta']

    key = d['_id']
    record = collection.find_one({"_id": key})

    # 300 is passed straight through to generate_doc2vec; presumably the
    # vector dimensionality -- confirm against that helper.
    vector = generate_doc2vec(record['content'], model, 300)

    # Create the feature container on first use, then add this feature.
    features = record.setdefault('ml-features', {})
    features['doc2vec'] = vector.tolist()

    return pu.update_doc(collection, key, record)
Пример #5
0
def insert_pdf_images(d):
    """Extract page images for a stored PDF and record them on the doc.

    Fetches the document's raw file from GridFS, passes it to
    import_page_images together with a scratch directory, and stores the
    returned value under the document's 'page_images' key (replacing any
    existing value).

    Inputs:
        d: mapping containing the document's '_id' (e.g. a record returned
           by a pymongo find).
    Output:
        The result of pu.update_doc.
    """
    client = py.MongoClient('mongo')
    db = client['docs']
    col = db['aug_meta']

    doc_id = d['_id']
    doc = col.find_one({"_id": doc_id})
    pdf_file = pu.get_from_gridfs(db, doc['raw_file'])

    # Create the scratch directory only once the input has been fetched,
    # and always clean it up -- previously it leaked whenever extraction
    # or the database update raised.
    temp_dir = tempfile.mkdtemp()
    try:
        # The trailing False is passed through unchanged; the office variant
        # passes True with a file path instead -- see import_page_images
        # for its exact meaning.
        doc['page_images'] = import_page_images(db, pdf_file, temp_dir, False)
        success = pu.update_doc(col, doc_id, doc)
    finally:
        fl.clean_temp_files(temp_dir)

    return success
Пример #6
0
def insert_office_images(d):
    """Extract page images for a stored office file and record them.

    Fetches the document's raw file from GridFS, writes it to a temporary
    file on disk, passes that file's path to import_page_images together
    with a scratch directory, and stores the returned value under the
    document's 'page_images' key (replacing any existing value).

    Inputs:
        d: mapping containing the document's '_id' (e.g. a record returned
           by a pymongo find).
    Output:
        The result of pu.update_doc.
    """
    client = py.MongoClient('mongo')
    db = client['docs']
    col = db['aug_meta']

    doc_id = d['_id']
    doc = col.find_one({"_id": doc_id})
    office_file = pu.get_from_gridfs(db, doc['raw_file'])

    temp_dir = tempfile.mkdtemp()
    f = tempfile.NamedTemporaryFile(mode='wb', delete=False)
    try:
        # Close the file before handing its path on: the original never
        # flushed or closed it, so the reader could see truncated content.
        with f:
            f.write(office_file)

        # The trailing True is passed through unchanged; the PDF variant
        # passes False with the raw file instead -- see import_page_images
        # for its exact meaning.
        doc['page_images'] = import_page_images(db, f.name, temp_dir, True)
        success = pu.update_doc(col, doc_id, doc)
    finally:
        # Remove the scratch directory and the temp file even on failure.
        fl.clean_temp_files(temp_dir, f.name)

    return success
Пример #7
0
def extract_postcode(d, b):
    """Find postcodes in a document's text and persist them.

    Runs find_ukpc over the document's 'text'. When it returns a truthy
    result, the result is stored on the document under 'postcode', written
    as JSON to S3 under "testing/<doc key>/postcodes.json", and the
    document is updated in the 'greenbook' collection.

    Inputs:
        d: mapping containing the document's '_id'.
        b: passed to aws.put_s3_object as its second argument
           (presumably the target bucket -- confirm against that helper).
    Output:
        The result of pu.update_doc when postcodes were found, else None.
    """
    collection = py.MongoClient('mongo')['docs']['greenbook']

    session = aws.create_session()

    key = d['_id']
    record = collection.find_one({"_id": key})

    found = find_ukpc(record['text'])
    if not found:
        # Nothing to store; the document is left untouched.
        return None

    record['postcode'] = found
    s3_key = "testing/" + record['key'] + "/postcodes.json"
    payload = aws.write_dict_json(found)
    aws.put_s3_object(session, b, s3_key, payload)

    return pu.update_doc(collection, key, record)