def insert_meta(b, f, c):
    """Insert TIKA-extracted metadata and content for one S3 object into Mongo.

    Reads the object ``f`` from bucket ``b``, hashes it, and either records it
    as a duplicate of an already-ingested document (same sha1) or runs Tika
    extraction, offloads the extracted content/metadata to the writable S3
    bucket, and inserts the resulting document into collection ``c``.

    Parameters:
        b: source S3 bucket name (extracted artifacts go to ``b + "-writable"``).
        f: S3 key of the document to process.
        c: Mongo collection name within the 'docs' database.

    Returns:
        Whatever pu.create_doc returns for the inserted document.
    """
    client = py.MongoClient('mongo')
    db = client['docs']
    col = db[c]
    session = aws.create_session()
    doc_stream = aws.get_s3_object(session, b, f).get()["Body"].read()
    sha1 = fl.create_sha(doc_stream, True)

    # If a document with this sha1 was already ingested, store only a stub
    # pointing at the original key instead of re-extracting content.
    meta_exists = list(col.find({"sha1": sha1}, {"key": True, "_id": False}))
    if meta_exists:
        doc = {
            'key': f,
            'sha1': sha1,
            'uuid': fl.create_uuid(),
            'duplicate': meta_exists[0]['key'],
        }
        return pu.create_doc(col, doc)

    doc = get_tika_content_stream(doc_stream)
    doc['key'] = f
    doc['sha1'] = sha1
    doc['uuid'] = fl.create_uuid()

    # Offload extracted text to S3 and keep only a boolean marker in Mongo;
    # empty content is dropped entirely.
    if 'content' in doc:
        if doc['content'] != "":
            s3_txt = aws.write_dict_json(doc['content'])
            aws.put_s3_object(session, b + "-writable",
                              doc['key'] + "/extracted.json", s3_txt)
            doc['content'] = True
        else:
            doc.pop('content', None)

    # Raw Tika metadata is likewise written to S3 alongside the document.
    if 'metadata' in doc:
        s3_meta = aws.write_dict_json(doc['metadata'])
        aws.put_s3_object(session, b + "-writable",
                          doc['key'] + "/metadata.json", s3_meta)

    # Process attachments and replace raw payloads with their insert results.
    # NOTE(review): iteration + .get(x) suggests attachments is dict-like even
    # though it is compared against [] — confirm get_tika_content_stream's shape.
    if 'attachments' in doc:
        if doc['attachments'] != []:
            attachments = doc['attachments']
            doc['no_attach'] = len(attachments)
            doc['attachments'] = [
                insert_attachments_meta(db, attachments.get(x), x, c, b, f,
                                        session)
                for x in attachments
            ]

    return pu.create_doc(col, doc)
def create_document(b, k):
    """Create a Mongo document from an S3 JSON file.

    Loads the JSON object at key ``k`` in bucket ``b`` and inserts it into the
    'greenbook' collection of the 'docs' database.

    Parameters:
        b: S3 bucket name.
        k: S3 key of the JSON file to load.

    Returns:
        Whatever pu.create_doc returns for the inserted document.
    """
    client = py.MongoClient('mongo')
    db = client['docs']
    col = db['greenbook']
    session = aws.create_session()
    s3_obj = aws.get_s3_object(session, b, k)
    return pu.create_doc(col, aws.read_s3_json(s3_obj))
def extract_postcode(d, b):
    """Extract UK postcodes from a document's text and persist them.

    Re-fetches the document by ``d['_id']`` from the 'greenbook' collection,
    scans its 'text' field for UK postcodes, writes any matches to S3 and
    updates the Mongo document with them.

    Parameters:
        d: mapping containing at least '_id' of the document to process.
        b: S3 bucket to receive the postcodes JSON.

    Returns:
        Result of pu.update_doc when postcodes were found, otherwise None.
    """
    client = py.MongoClient('mongo')
    db = client['docs']
    col = db['greenbook']
    session = aws.create_session()
    doc_id = d['_id']
    doc = col.find_one({"_id": doc_id})
    postcodes = find_ukpc(doc['text'])
    if not postcodes:
        return None

    doc['postcode'] = postcodes
    pc_key = "testing/" + doc['key'] + "/postcodes.json"
    s3_pc = aws.write_dict_json(postcodes)
    aws.put_s3_object(session, b, pc_key, s3_pc)
    return pu.update_doc(col, doc_id, doc)
import boto3
import os
from rq import Queue
from redis import Redis
import docproc.awsutil as aws
from docproc.mgtika import insert_meta
import time

# Enqueue one insert_meta job per S3 object under the configured read prefix,
# then report how long submission took.
start_time = time.time()

redis_conn = Redis(host='redis')
q = Queue(connection=redis_conn)
session = aws.create_session()

rd = os.environ['S3_READ_BUCKET']
wt = os.environ['S3_WRITE_BUCKET']
rd_path = os.environ['S3_READ_PATH']
col = os.environ['COLLECTION']
rd_bucket = aws.get_s3_bucket(session, rd)

jobs = 0
runs = 1
for run in range(runs):
    for obj in rd_bucket.objects.filter(Prefix=rd_path):
        job = q.enqueue(insert_meta, rd, obj.key, col)
        jobs += 1
        print("Submitted: " + str(jobs) + " jobs")

elapsed_time = time.time() - start_time
print("submission time:", elapsed_time)