class TestCharterAnalyse(unittest.TestCase):
    """Smoke test that runs the charter parser over charters stored in MongoDB."""

    @unittest.skipIf(get_mongodb_connection() is None, "requires mongo")
    def test_get_org_names(self):
        """Parse every charter of the first audit and print the resulting tags."""
        charter_parser = CharterParser()

        found_audits = get_audits()
        if len(found_audits) == 0:
            logger.warning('no audits')
            return

        first_audit_id = found_audits[0]['_id']
        for stored in get_docs_by_audit_id(first_audit_id, kind=CHARTER):
            print(stored['filename'])

            charter: CharterDocument = join_paragraphs(stored['parse'],
                                                       doc_id=stored['_id'])

            # TODO: mind, this could be slow if embedding is required
            charter_parser.find_org_date_number(charter, AuditContext())

            for tag in charter.get_tags():
                print(tag)
def _test_convert():
    """Convert the sample protocol, charter and contract documents.

    The charter and contract results are validated against
    ``document_schemas`` and written to ``analysis.attributes_tree`` of the
    corresponding document. The protocol is converted only.
    """
    # charter: 5f64161009d100a445b7b0d6
    # protocol: 5ded4e214ddc27bcf92dd6cc
    # contract: 5f0bb4bd138e9184feef1fa8
    db = get_mongodb_connection()

    def _validate_and_store(converted):
        # `converted` is the (tree, json, source document) triple returned by
        # the _test_charter / _test_contract helpers.
        tree, tree_json, source_doc = converted
        validate(instance=tree_json,
                 schema=document_schemas,
                 format_checker=FormatChecker())
        db["documents"].update_one(
            {'_id': source_doc["_id"]},
            {"$set": {"analysis.attributes_tree": tree}})

    # The protocol result is neither validated nor written back here.
    _test_protocol()

    _validate_and_store(_test_charter())
    _validate_and_store(_test_contract())
def get_audits():
    """Return a cursor over audits with status 'Finalizing', ordered by ascending ``createDate``."""
    audits = get_mongodb_connection()['audits']
    finalizing = audits.find({'status': 'Finalizing'})
    return finalizing.sort([("createDate", pymongo.ASCENDING)])
def read_all_docs(files_dir: str, doc_type='CONTRACT'):
    """Yield parsed documents of type ``doc_type`` for every file found under ``files_dir``.

    Each file is looked up in the ``legaldocs`` collection by its id and the
    current ``WordDocParser`` version. When no record is found, the file is
    parsed with ``WordDocParser.read_doc`` and the result is stored. Parser or
    storage errors are printed with a traceback and counted, and processing
    continues with the next file.

    :param files_dir: directory handed to ``WordDocParser.list_filenames``.
    :param doc_type: only sub-documents whose ``documentType`` equals this value are yielded.
    """
    legaldocs = get_mongodb_connection()['legaldocs']
    word_parser = WordDocParser()
    filenames = word_parser.list_filenames(files_dir)

    cnt = 0
    failures = 0
    unknowns = 0
    nodate = 0

    def stats():
        print(f'processed:{cnt};\t failures:\t{failures}\t unknown type: {unknowns}\t unknown date: {nodate}')

    for cnt, fn in enumerate(filenames, start=1):
        parts = fn.split('/')
        shortfn = parts[-1]
        # The first five path components are dropped from the stored path.
        pth = '/'.join(parts[5:-1])
        _doc_id = pth + '/' + shortfn

        print(cnt, fn)

        docs = legaldocs.find_one({"_id": _doc_id, 'version': word_parser.version})
        if docs is None:
            # parse and save to DB
            try:
                docs = word_parser.read_doc(fn)
                docs['short_filename'] = shortfn
                docs['path'] = pth
                docs['_id'] = _doc_id

                legaldocs.delete_many({"_id": _doc_id})
                legaldocs.insert_one(docs)
            except Exception:
                print(f"{fn}\nException in WordDocParser code:")
                traceback.print_exc(file=sys.stdout)
                failures += 1

        if docs:
            for res in docs['documents']:
                if doc_type == res["documentType"]:
                    yield _parse_doc(res, _doc_id)

    # NOTE(review): the summary is assumed to be printed once, after the last
    # file; confirm against the original indentation.
    stats()
def get_updated_contracts(self):
    """Return a cursor over the ids of CONTRACT documents changed after ``self.lastdate``.

    ``self.lastdate`` is reset to 1900-01-01 and, when ``self.stats`` is not
    empty, replaced by the maximum over its ``user_correction_date`` and
    ``analyze_date`` columns. The query selects documents in state 15 that
    have analysis or user attributes and whose analysis timestamp or user
    update date is later than that value. The cursor is limited to 600 rows.
    """
    self.lastdate = datetime(1900, 1, 1)
    if len(self.stats) > 0:
        date_columns = self.stats[["user_correction_date", 'analyze_date']]
        self.lastdate = date_columns.max().max()
    logger.info(f'latest export_date: [{self.lastdate}]')

    logger.debug('obtaining DB connection...')
    documents_collection = get_mongodb_connection()['documents']

    # TODO: filter by version
    has_attributes = {
        '$or': [
            {"analysis.attributes": {"$ne": None}},
            {"user.attributes": {"$ne": None}},
        ]
    }
    changed_since_lastdate = {
        '$or': [
            {'analysis.analyze_timestamp': {'$gt': self.lastdate}},
            {'user.updateDate': {'$gt': self.lastdate}},
        ]
    }
    query = {
        '$and': [
            {"parse.documentType": "CONTRACT"},
            {"state": 15},
            has_attributes,
            changed_since_lastdate,
        ]
    }

    logger.debug(f'running DB query {query}')
    # TODO: sorting fails in MONGO
    sorting = [('analysis.analyze_timestamp', ASCENDING),
               ('user.updateDate', ASCENDING)]
    found = documents_collection.find(filter=query,
                                      sort=sorting,
                                      projection={'_id': True})
    found.limit(600)

    logger.info('running DB query: DONE')

    return found
def save_violations(audit, violations):
    """Store ``violations`` on the audit and mark the audit as "Done".

    Both fields are written in a single ``$set`` so the audit can never be
    left with new violations but a stale status if a second write fails.

    :param audit: audit document; only its ``_id`` is read.
    :param violations: value stored under the audit's ``violations`` field.
    """
    db = get_mongodb_connection()
    db["audits"].update_one(
        {'_id': audit["_id"]},
        {"$set": {"violations": violations, "status": "Done"}})
def test_doc_parser(self):
    """Rebuild one LegalDocument from the parser's paragraphs and slice it back by paragraph.

    Reads a fixed .docx file, appends every paragraph header and body to a
    single document while recording their token spans, then prints the joined
    text. Returns immediately when no MongoDB connection is available.
    """
    if get_mongodb_connection() is None:
        # TODO: this is a weird way of detecting we're on CI
        return

    FILENAME = "/Users/artem/work/nemo/goil/IN/Другие договоры/Договор Формула.docx"

    parsed = WordDocParser().read_doc(FILENAME)

    doc: LegalDocument = LegalDocument('')
    doc.parse()

    last = 0
    for sub_document in parsed['documents']:
        for p in sub_document['paragraphs']:
            header_text = p['paragraphHeader']['text'] + '\n'
            body_text = p['paragraphBody']['text'] + '\n'

            # Append the header and remember which tokens it occupies.
            header = LegalDocument(header_text)
            header.parse()
            doc += header
            headerspan = (last, len(doc.tokens_map))
            print(headerspan)
            last = len(doc.tokens_map)

            # Same for the body.
            body = LegalDocument(body_text)
            body.parse()
            doc += body
            bodyspan = (last, len(doc.tokens_map))

            header_tag = SemanticTag('headline', header_text, headerspan)
            body_tag = SemanticTag('paragraphBody', None, bodyspan)
            print(header_tag)

            para = Paragraph(header_tag, body_tag)
            doc.paragraphs.append(para)
            last = len(doc.tokens_map)

            # Results are unused; the calls only check that slicing works.
            doc.subdoc_slice(para.header.as_slice())
            doc.subdoc_slice(para.body.as_slice())

    separator = '-' * 100
    print(separator)
    print(doc.text)

    for paragraph in doc.paragraphs:
        doc.subdoc_slice(paragraph.header.as_slice())
    print(separator)
def get_audits() -> [dict]:
    """Return all audits with status 'InWork' as a list, ordered by ascending ``createDate``."""
    audits = get_mongodb_connection()['audits']
    in_work = audits.find({'status': 'InWork'}).sort([("createDate", pymongo.ASCENDING)])
    return list(in_work)
def get_attributes_tree(id: str):
    """Return the ``charter`` entry of a document's ``analysis.attributes_tree``.

    :param id: string form of the document's ObjectId.
    :return: ``dotdict(tree).charter``, or ``None`` when the document, its
        ``analysis`` section or the ``attributes_tree`` is missing.
    """
    doc = get_doc_by_id(ObjectId(id))
    if doc is None:
        # Unknown id: previously this raised AttributeError on `doc.get`.
        return None

    analysis = doc.get('analysis')
    if not analysis:
        return None

    tree = analysis.get('attributes_tree')
    if tree is None:
        # No tree stored yet; avoid wrapping None in a dotdict.
        return None

    return dotdict(tree).charter
def add_link(audit_id, doc_id1, doc_id2):
    """Push an "analysis" link from ``doc_id1`` to ``doc_id2`` onto the audit's ``links`` array."""
    link = {"fromId": doc_id1, "toId": doc_id2, "type": "analysis"}
    audits = get_mongodb_connection()['audits']
    audits.update_one({"_id": audit_id}, {"$push": {"links": link}})
def save_analysis(db_document: DbJsonDoc, doc: LegalDocument, state: int, retry_number: int = 0):
    """Attach the analysis of ``doc`` to ``db_document`` and write it to the ``documents`` collection.

    The stored record is replaced by ``db_document.as_dict()``; it is inserted
    when no record with the id of ``doc`` exists yet.

    :param db_document: record to update; its ``analysis``, ``state`` and
        ``retry_number`` attributes are overwritten.
    :param doc: analysed document providing ``to_json_obj()`` and ``get_id()``.
    :param state: new value of ``db_document.state``.
    :param retry_number: new value of ``db_document.retry_number``.
    """
    # TODO: does not save attributes
    analyse_json_obj: dict = doc.to_json_obj()
    db = get_mongodb_connection()
    documents_collection = db['documents']

    db_document.analysis = analyse_json_obj
    db_document.state = state
    db_document.retry_number = retry_number

    # `Collection.update` is deprecated in PyMongo 3 and removed in PyMongo 4.
    # A whole-document update with upsert is spelled `replace_one`.
    documents_collection.replace_one({'_id': doc.get_id()}, db_document.as_dict(), upsert=True)
def _get_doc_from_db(self, kind):
    """Yield the first state-15 document of type ``kind`` from the audit with the earliest ``createDate``.

    Nothing is yielded when there is no audit or the audit has no matching
    document.
    """
    audits_collection = get_mongodb_connection()['audits']
    earliest = audits_collection.find().sort([("createDate", pymongo.ASCENDING)]).limit(1)

    for audit in earliest:
        doc_ids = get_docs_by_audit_id(audit['_id'], kind=kind, states=[15], id_only=True)
        if len(doc_ids) == 0:
            continue

        first_id = doc_ids[0]
        print(first_id)
        yield finalizer.get_doc_by_id(first_id)
def get_docs_by_audit_id(id: str, state, kind=None, id_only=False, without_large_fields=False):
    """Return the audit's documents of the given kind and state that have a date attribute.

    A document matches when ``analysis.attributes.date`` is set and ``user``
    is null, or when ``user.attributes.date`` is set.

    :param id: value matched against ``auditId``.
    :param state: value matched against ``state``.
    :param kind: value matched against ``parse.documentType``.
    :param id_only: project only ``_id``.
    :param without_large_fields: when ``id_only`` is false, exclude the text,
        tokenization, header and paragraph fields from the result.
    :return: list of matching documents.
    """
    documents_collection = get_mongodb_connection()['documents']

    analysed_and_untouched = {
        "$and": [
            {"analysis.attributes.date": {"$ne": None}},
            {"user": None},
        ]
    }
    edited_by_user = {"user.attributes.date": {"$ne": None}}
    query = {
        'auditId': id,
        'parse.documentType': kind,
        "state": state,
        "$or": [analysed_and_untouched, edited_by_user],
    }

    if id_only:
        projection = {'_id': True}
    elif without_large_fields:
        projection = {
            'analysis.original_text': False,
            'analysis.normal_text': False,
            'analysis.tokenization_maps': False,
            'analysis.headers': False,
            'parse.paragraphs': False,
        }
    else:
        # No projection: whole documents are returned.
        projection = None

    return list(documents_collection.find(query, projection=projection))
def remove_old_links(audit_id, contract_id):
    """Pull every "analysis" link that starts or ends at ``contract_id`` from the audit's ``links`` array."""
    touches_contract = [{"toId": contract_id}, {"fromId": contract_id}]
    stale_links = {"type": "analysis", "$or": touches_contract}

    audits = get_mongodb_connection()['audits']
    audits.update_one({"_id": audit_id}, {"$pull": {"links": stale_links}})
def get_docs_by_audit_id(id: str or None, states=None, kind=None, id_only=False) -> []:
    """Return the audit's successfully parsed documents that match the version/state filter.

    A document matches when ``parserResponseCode`` is 200 and at least one of
    these holds: ``analysis.version`` is null, ``state`` is null, or ``state``
    equals one of ``states``.

    :param id: value matched against ``auditId``.
    :param states: optional iterable of additional accepted ``state`` values.
    :param kind: optional value matched against ``parse.documentType``.
    :param id_only: return a list of ``_id`` values instead of whole documents.
    """
    documents_collection = get_mongodb_connection()['documents']

    accepted = [
        {"analysis.version": None},
        # {"analysis.version": {"$ne": analyser.__version__}},
        {"state": None},
    ]
    if states is not None:
        accepted.extend({"state": state} for state in states)

    conditions = [
        {'auditId': id},
        {"parserResponseCode": 200},
        {"$or": accepted},
    ]
    if kind is not None:
        conditions.append({'parse.documentType': kind})
    query = {"$and": conditions}

    if id_only:
        cursor = documents_collection.find(query, projection={'_id': True})
        return [doc["_id"] for doc in cursor]

    return list(documents_collection.find(query))
def find_top_headers():
    """Print headers seen more than 10 times, most frequent first, and save them to ``top_headers.csv``."""
    headers = get_mongodb_connection()['headers']
    frequent = headers.find({'count': {'$gt': 10}}).sort('count', pymongo.DESCENDING)

    rows = []
    for header in frequent:
        rows.append({key: header[key] for key in ('text', 'count', 'doc_id')})
        print(header['count'], '\t', header['text'])

    DataFrame.from_records(rows).to_csv('top_headers.csv')
def dump_contracts_from_db_to_jsons(output_path):
    """Write the stored ``legaldocs`` record of every listed file to ``<output_path>/<id>.json``.

    Files without a record in the collection are skipped. Values that
    ``json`` cannot serialize are written as the string
    ``'<not serializable>'``.

    :param output_path: existing directory that receives the JSON files.
    """
    collection = get_mongodb_connection()['legaldocs']

    wp = WordDocParser()
    filenames = wp.list_filenames('/Users/artem/Downloads/Telegram Desktop/X0/')

    for fn in filenames:
        print(fn)
        parts = fn.split('/')
        shortfn = parts[-1]
        # The first five path components are dropped from the id.
        pth = '/'.join(parts[5:-1])
        _doc_id = pth + '/' + shortfn

        res = collection.find_one({"_id": _doc_id})
        if res is None:
            continue

        json_name = _doc_id.replace('/', '_')
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be opened with an explicit encoding instead of the locale default.
        with open(f'{output_path}/{json_name}.json', 'w', encoding='utf-8') as file:
            _j = json.dumps(res, indent=4, ensure_ascii=False, default=lambda o: '<not serializable>')
            file.write(_j)
            print(f'saved file to {json_name}')
def analyse_headers(): db = get_mongodb_connection() collection = db['legaldocs'] headers_collection = db['headers'] headers_collection.drop() headers_collection = db['headers'] res = collection.find({}) k = 0 for doc in res: print(k) k += 1 for p in doc['paragraphs']: header_text = p['paragraphHeader']['text'] tokens = TOKENIZER_DEFAULT.tokenize(header_text) _, span, _, _ = get_tokenized_line_number(tokens, 0) header_id = ' '.join(tokens[span[1]:]) header_id = header_id.lower() # header_id = header_text existing = headers_collection.find_one({'text': header_id}) if existing: existing['count'] += 1 headers_collection.update({'_id': existing['_id']}, existing, True) pass else: header = { # '_id': header_id, 'doc_id': doc['_id'], 'text': header_id, 'len': len(header_text), 'has_newlines': header_text.count('\n'), 'count': 1 } headers_collection.insert_one(header) # headers_collection.update_one({'_id': header_id}, header, True) """
def update_db_dictionaries():
    """Reload the dictionary collections, store the analyser version and create ``documents`` indices."""
    db = get_mongodb_connection()

    def _reload(collection_name, make_documents):
        # The documents are produced only after the collection is emptied.
        coll = db[collection_name]
        coll.delete_many({})
        coll.insert_many(make_documents())

    _reload("subsidiaries", lambda: subsidiaries)
    _reload("orgStructuralLevel", OrgStructuralLevel.as_db_json)
    _reload("legalEntityTypes", legal_entity_types_as_db_json)
    _reload("contractSubjects", contract_subject_as_db_json)

    version_coll = db["analyser"]
    version_coll.delete_many({})
    version_coll.insert_one({'version': analyser.__version__})

    # indexing
    print('creating db indices')
    documents = db["documents"]
    index_specs = [
        [("analysis.analyze_timestamp", DESCENDING)],
        [("user.updateDate", DESCENDING)],
        [("analysis.attributes.date.value", DESCENDING)],
        [('analysis.analyze_timestamp', ASCENDING), ('user.updateDate', ASCENDING)],
    ]
    for spec in index_specs:
        resp = documents.create_index(spec)
        print("index response:", resp)
def convert_all_docs():
    """Run ``convert_one`` on every legacy document once ``should_i_migrate`` approves the migration."""
    ids = get_legacy_docs_ids()

    if not should_i_migrate(ids):
        print('Skipping migration. Re-run when you change your mind.')
        return

    db = get_mongodb_connection()
    documents_collection = db['documents']

    # Only the fields listed here are loaded for each document.
    needed_fields = {
        '_id': True,
        'analysis.attributes': True,
        'user.attributes': True,
        'parse.documentType': True,
    }
    for doc_id in ids:
        legacy_doc = documents_collection.find_one({"_id": doc_id}, projection=needed_fields)
        convert_one(db, legacy_doc)

    migration_logger.info(f"converted {len(ids)} documents")
class TestRunner(unittest.TestCase):
    """Integration tests for audit/document retrieval and phase-1 processing; all need MongoDB."""

    default_no_tf_instance: Runner = None

    # Probe MongoDB once at class-definition time and share the resulting
    # skip decorator, instead of repeating the probe for every test.
    _requires_mongo = unittest.skipIf(get_mongodb_connection() is None, "requires mongo")

    @_requires_mongo
    def test_get_audits(self):
        """Print the id of every audit returned by ``get_audits``."""
        for audit in get_audits():
            print(audit['_id'])

    @_requires_mongo
    def test_get_docs_by_audit_id(self):
        """Print id and filename of the first audit's protocols."""
        audits = get_audits()
        if len(audits) == 0:
            logger.warning('no audits')
            return

        audit_id = audits[0]['_id']

        docs = get_docs_by_audit_id(audit_id, kind='PROTOCOL')
        for a in docs:
            print(a['_id'], a['filename'])

    def _get_doc_from_db(self, kind):
        """Yield the first state-15 document of ``kind`` from the audit with the earliest ``createDate``."""
        audits = get_mongodb_connection()['audits'].find().sort([
            ("createDate", pymongo.ASCENDING)
        ]).limit(1)
        for audit in audits:
            doc_ids = get_docs_by_audit_id(audit['_id'], kind=kind, states=[15], id_only=True)
            if len(doc_ids) > 0:
                print(doc_ids[0])
                doc = finalizer.get_doc_by_id(doc_ids[0])
                yield doc

    def _preprocess_single_doc(self, kind):
        """Run the ``kind`` processor's ``preprocess`` on the document from ``_get_doc_from_db``."""
        for doc in self._get_doc_from_db(kind):
            d = DbJsonDoc(doc)
            processor = document_processors.get(kind)
            processor.preprocess(d, AuditContext())

    # @unittest.skipIf(SKIP_TF, "requires TF")
    @_requires_mongo
    def test_preprocess_single_protocol(self):
        self._preprocess_single_doc('PROTOCOL')

    # The condition here used to read `... is None is None`; it evaluated the
    # same way but was a typo.
    @_requires_mongo
    def test_preprocess_single_contract(self):
        self._preprocess_single_doc('CONTRACT')

    @_requires_mongo
    def test_process_contracts_phase_1(self):
        """Preprocess every contract of the first audit."""
        audits = get_audits()
        if len(audits) == 0:
            logger.warning('no audits')
            return

        audit_id = audits[0]['_id']
        docs = get_docs_by_audit_id(audit_id, kind='CONTRACT')
        processor = document_processors.get('CONTRACT')
        for _doc in docs:
            jdoc = DbJsonDoc(_doc)
            processor.preprocess(jdoc, AuditContext())

    @_requires_mongo
    def test_process_charters_phase_1(self):
        """Preprocess every charter of the first audit."""
        audits = get_audits()
        if len(audits) == 0:
            logger.warning('no audits')
            return

        audit_id = audits[0]['_id']
        docs: [dict] = get_docs_by_audit_id(audit_id, kind='CHARTER')
        processor = document_processors.get('CHARTER')
        for _doc in docs:
            jdoc = DbJsonDoc(_doc)
            processor.preprocess(jdoc, AuditContext())

    @_requires_mongo
    def test_process_protocols_phase_1(self):
        """Run the protocol parser on the protocols of every audit and save each analysis with state -1."""
        runner = get_runner_instance_no_embedder()

        for audit in get_audits():
            audit_id = audit['_id']
            docs = get_docs_by_audit_id(audit_id, kind='PROTOCOL')

            for doc in docs:
                jdoc = DbJsonDoc(doc)
                legal_doc = jdoc.asLegalDoc()

                runner.protocol_parser.find_org_date_number(legal_doc, AuditContext())
                save_analysis(jdoc, legal_doc, -1)


# if get_mongodb_connection() is not None:
unittest.main(argv=['-e utf-8'], verbosity=3, exit=False)
from analyser.log import logger from analyser.parsing import AuditContext from analyser.persistence import DbJsonDoc from analyser.runner import Runner, get_audits, get_docs_by_audit_id, document_processors, save_analysis from integration.db import get_mongodb_connection SKIP_TF = True def get_runner_instance_no_embedder() -> Runner: if TestRunner.default_no_tf_instance is None: TestRunner.default_no_tf_instance = Runner(init_embedder=False) return TestRunner.default_no_tf_instance @unittest.skipIf(get_mongodb_connection() is None, "requires mongo") class TestRunner(unittest.TestCase): default_no_tf_instance: Runner = None @unittest.skipIf(get_mongodb_connection() is None, "requires mongo") def test_get_audits(self): aa = get_audits() for a in aa: print(a['_id']) @unittest.skipIf(get_mongodb_connection() is None, "requires mongo") def test_get_docs_by_audit_id(self): audits = get_audits() if len(audits) == 0: logger.warning('no audits') return
def _test_protocol():
    """Load the sample protocol document by its fixed id and run ``convert_one`` on it."""
    connection = get_mongodb_connection()
    protocol_doc = get_doc_by_id(ObjectId('5df7a66b200a3f4d0fad786f'))  # protocol
    convert_one(connection, protocol_doc)
def change_audit_status(audit, status):
    """Set the ``status`` field of the given audit; only ``audit["_id"]`` is read."""
    audits = get_mongodb_connection()["audits"]
    selector = {'_id': audit["_id"]}
    audits.update_one(selector, {"$set": {"status": status}})
def change_contract_primary_subject(contract, new_subject):
    """Set ``primary_subject`` of the given contract document to ``new_subject``."""
    documents = get_mongodb_connection()['documents']
    selector = {'_id': contract['_id']}
    documents.update_one(selector, {'$set': {'primary_subject': new_subject}})
body_features = line_features(bodymap, line_span, ln, _prev_features) body_features['actual'] = 0 _features.append(body_features) _prev_features = body_features.copy() ln += 1 return _features if __name__ == '__main__': features_dicts = [] count = 0 db = get_mongodb_connection() criterion = {'version': WordDocParser.version} res = db['legaldocs'].find(criterion) for resp in res: for d in resp['documents']: doctype = d['documentType'] if doctype in ('CONTRACT', 'PROTOCOL', 'CHARTER'): print(resp['_id']) legal_doc = join_paragraphs(d, resp['_id']) _doc_features = doc_line_features(legal_doc) features_dicts += _doc_features count += 1
def get_doc_by_id(doc_id: ObjectId):
    """Return the record from the ``documents`` collection whose ``_id`` equals ``doc_id`` (result of ``find_one``)."""
    documents = get_mongodb_connection()['documents']
    found = documents.find_one({'_id': doc_id})
    return found
def get_audit_by_id(aid: ObjectId):
    """Return the record from the ``audits`` collection whose ``_id`` equals ``aid`` (result of ``find_one``)."""
    audits = get_mongodb_connection()['audits']
    found = audits.find_one({'_id': aid})
    return found
class AnalyzerTestCase(unittest.TestCase):
    """Runs the document processors end to end on fixed documents stored in MongoDB."""

    def _preprocess_and_process(self, kind, doc_id):
        """Fetch the document ``doc_id`` and run the ``kind`` processor's two phases on it.

        Raises ``RuntimeError`` when the document is not in the database.
        """
        processor: BaseProcessor = document_processors[kind]
        doc = get_doc_by_id(ObjectId(doc_id))
        if doc is None:
            raise RuntimeError("fix unit test please")

        audit = get_audit_by_id(doc['auditId'])

        jdoc = DbJsonDoc(doc)
        logger.info(f'......pre-processing {jdoc._id}')
        ctx = AuditContext()
        processor.preprocess(jdoc, context=ctx)
        processor.process(jdoc, audit, ctx)

    @unittest.skip
    def test_analyse_acontract(self):
        """Process one contract with a context built from its audit's subsidiary name, then print it."""
        doc = get_doc_by_id(ObjectId('5fdb213f542ce403c92b4530'))
        audit = get_audit_by_id(doc['auditId'])

        jdoc = DbJsonDoc(doc)
        logger.info(f'......pre-processing {jdoc._id}')

        _audit_subsidiary: str = audit["subsidiary"]["name"]
        ctx = AuditContext(_audit_subsidiary)

        processor: BaseProcessor = document_processors[CONTRACT]
        processor.preprocess(jdoc, context=ctx)
        processor.process(jdoc, audit, ctx)
        print(jdoc)

    @unittest.skipIf(get_mongodb_connection() is None, "requires mongo")
    def test_analyze_contract(self):
        self._preprocess_and_process(CONTRACT, '5ded004e4ddc27bcf92dd47c')

    @unittest.skipIf(get_mongodb_connection() is None, "requires mongo")
    def test_analyze_protocol(self):
        self._preprocess_and_process(PROTOCOL, '5e5de70b01c6c73c19eebd35')

    @unittest.skipIf(get_mongodb_connection() is None, "requires mongo")
    def test_analyze_charter(self):
        self._preprocess_and_process(CHARTER, '5e5de70d01c6c73c19eebd48')
def change_doc_state(doc, state):
    """Set the ``state`` field of the stored document identified by ``doc.get_id()``."""
    documents = get_mongodb_connection()['documents']
    selector = {'_id': doc.get_id()}
    documents.update_one(selector, {"$set": {"state": state}})