def test_lease_doc_detection(): """ Tests lease document detector on a totally new lease agreement template downloaded from Internet which did not participate in the original dataset on which the model was tested/trained. TODO: Add more data when this code moves to core project. :return: """ detector = LeaseDocDetector() is_lease = detector.is_lease_document(LEASE_AGREEMENT_1) assert_equal(True, is_lease)
class ProcessLeaseDocuments(BaseTask): name = 'Process Lease Documents' lease_doc_detector = LeaseDocDetector() def process(self, **kwargs): self.log_info( "Going to detect lease documents among the all loaded documents in the system...") if kwargs.get('delete'): for ld in LeaseDocument.objects.all(): ld.delete(keep_parents=True) documents = Document.objects.all() # TODO: outdated if kwargs.get('document_type'): documents = documents.filter(document_type__in=kwargs['document_type']) self.log_info( 'Filter documents by "%s" document type.' % str(kwargs['document_type'])) if kwargs.get('document_id'): documents = documents.filter(pk=kwargs['document_id']) self.log_info('Process document id={}.'.format(kwargs['document_id'])) detect_and_process_lease_document_args = [] for row in documents.values_list('id'): detect_and_process_lease_document_args \ .append((row[0], kwargs.get('no_detect', True))) self.run_sub_tasks('Detect And Process Each Lease Document', ProcessLeaseDocuments.detect_and_process_lease_document, detect_and_process_lease_document_args) @staticmethod @shared_task(base=ExtendedTask, bind=True) def detect_and_process_lease_document(task: ExtendedTask, document_id: int, no_detect: bool): doc = Document.objects.get(pk=document_id) doc_text = doc.full_text try: lease_doc = LeaseDocument.objects.get(pk=document_id) except: lease_doc = None if lease_doc or no_detect or ProcessLeaseDocuments.lease_doc_detector.is_lease_document( doc_text): task.log_info('{2} lease document: #{0}. {1}' .format(document_id, doc.name, 'Processing' if no_detect else 'Detected')) if not lease_doc: lease_doc = LeaseDocument(document_ptr=doc) lease_doc.__dict__.update(doc.__dict__) ProcessLeaseDocuments.process_landlord_tenant(lease_doc, doc_text) ProcessLeaseDocuments.process_fields(lease_doc, doc_text, task) lease_doc.save() else: task.log_info('Not a lease document: #{0}. {1}'.format(document_id, doc.name)) @staticmethod def get_or_create_party(company_desc: Tuple) -> Party: name, _type, type_abbr, type_label, type_desc = company_desc defaults = dict( type=_type.upper() if _type else None, type_label=type_label.upper() if type_label else None, type_description=type_desc.upper() if type_desc else None ) party, _ = Party.objects.get_or_create( name=name.upper() if name else None, type_abbr=type_abbr.upper() if type_abbr else None, defaults=defaults ) return party @staticmethod def process_landlord_tenant(doc: LeaseDocument, doc_text: str): landlord, tenant = find_landlord_tenant(doc_text) doc.lessor = landlord doc.lessee = tenant @staticmethod def ordered_list_without_repetitions(sentence_list: List[str], separator: str = '\n'): if not sentence_list: return None sentence_list_no_repeat = list() for sentence in sentence_list: if sentence not in sentence_list_no_repeat: sentence_list_no_repeat.append(sentence) return separator.join(sentence_list_no_repeat) @staticmethod def process_fields(doc: LeaseDocument, doc_text: str, task: ExtendedTask): sentences = get_sentence_list(doc_text) # fields = detect_fields(sentences, groups=('address',)) fields = detect_fields(sentences) doc.address = fields.get('address') if not doc.address: doc.address = detect_address_default(doc_text, sentences) # term doc.commencement_date = fields.get('commencement_date') doc.expiration_date = fields.get('expiration_date') term_tuple = fields.get('term') if term_tuple: term = timedelta(days=term_tuple[2]) if doc.commencement_date and not doc.expiration_date: doc.expiration_date = doc.commencement_date + term elif not doc.commencement_date and doc.expiration_date: doc.commencement_date = doc.expiration_date - term if doc.commencement_date \ and doc.expiration_date \ and doc.commencement_date >= doc.expiration_date: doc.expiration_date = None # lease type pay_taxes = int(fields.get('pay_taxes') or False) pay_costs = int(fields.get('pay_costs') or False) pay_insurance = int(fields.get('pay_insurance') or False) lt = pay_taxes + pay_costs + pay_insurance if lt == 3: doc.lease_type = 'triple-net' elif lt == 2: doc.lease_type = 'double-net' elif lt == 1: doc.lease_type = 'single-net' else: doc.lease_type = 'gross' # property type property_types = list(fields.get('property_types__set') or set()) property_types.sort() doc.property_type = '; '.join(property_types) # permitted use doc.permitted_uses = fields.get('permitted_use') # prohibited use doc.prohibited_uses = ProcessLeaseDocuments.ordered_list_without_repetitions( fields.get('prohibited_use__list')) renew_duration_tuple = fields.get('renew_non_renew_notice') if renew_duration_tuple: doc.renew_non_renew_notice_duration = timedelta(days=renew_duration_tuple[2]) auto_renew = fields.get('auto_renew') if auto_renew is not None: doc.auto_renew = auto_renew area_square_feet_list = fields.get('area_square_feet__list') if area_square_feet_list: doc.area_size_sq_ft = area_square_feet_list[0] doc.alterations_allowed = ProcessLeaseDocuments.ordered_list_without_repetitions( fields.get('alterations_allowed__list')) security_deposit = fields.get('security_deposit__set') if security_deposit: doc.security_deposit = max(security_deposit) doc.rent_due_frequency = fields.get('rent_due_frequency') mean_rent_per_month = fields.get('mean_rent_per_month__set') if mean_rent_per_month: doc.mean_rent_per_month = max(mean_rent_per_month)
class ProcessLeaseDocuments(BaseTask): name = 'Process Lease Documents' lease_doc_detector = LeaseDocDetector() def process(self, **kwargs): self.log( "Going to detect lease documents among the all loaded documents in the system..." ) if kwargs['delete']: for ld in LeaseDocument.objects.all(): ld.delete(keep_parents=True) documents = Document.objects.all() if kwargs.get('document_type'): documents = documents.filter( document_type__in=kwargs['document_type']) self.log('Filter documents by "%s" document type.' % str(kwargs['document_type'])) self.task.subtasks_total = documents.count() self.task.save() for row in documents.values_list('id'): ProcessLeaseDocuments.detect_and_process_lease_document.apply_async( args=(row[0], kwargs['no_detect'], kwargs['task_id']), task_id='%d_%s' % (self.task.id, fast_uuid())) @staticmethod @shared_task def detect_and_process_lease_document(document_id: int, no_detect: bool, task_id): doc = Document.objects.get(pk=document_id) doc_text = doc.full_text try: lease_doc = LeaseDocument.objects.get(pk=document_id) except: lease_doc = None if lease_doc or no_detect or ProcessLeaseDocuments.lease_doc_detector.is_lease_document( doc_text): log('{2} lease document: #{0}. {1}'.format( document_id, doc.name, 'Processing' if no_detect else 'Detected'), task=task_id) if not lease_doc: lease_doc = LeaseDocument(document_ptr=doc) lease_doc.__dict__.update(doc.__dict__) ProcessLeaseDocuments.process_landlord_tenant(lease_doc, doc_text) ProcessLeaseDocuments.process_fields(lease_doc, doc_text, task_id) lease_doc.save() else: log('Not a lease document: #{0}. {1}'.format( document_id, doc.name), task=task_id) @staticmethod def get_or_create_party(company_desc: Tuple) -> Party: name, _type, type_abbr, type_label, type_desc = company_desc defaults = dict( type=_type.upper() if _type else None, type_label=type_label.upper() if type_label else None, type_description=type_desc.upper() if type_desc else None) party, _ = Party.objects.get_or_create( name=name.upper() if name else None, type_abbr=type_abbr.upper() if type_abbr else None, defaults=defaults) return party @staticmethod def process_landlord_tenant(doc: LeaseDocument, doc_text: str): landlord, tenant = find_landlord_tenant(doc_text) doc.lessor = landlord doc.lessee = tenant @staticmethod def ordered_list_without_repetitions(sentence_list: List[str], separator: str = '\n'): if not sentence_list: return None sentence_list_no_repeat = list() for sentence in sentence_list: if sentence not in sentence_list_no_repeat: sentence_list_no_repeat.append(sentence) return separator.join(sentence_list_no_repeat) @staticmethod def process_fields(doc: LeaseDocument, doc_text: str, task_id): sentences = get_sentence_list(doc_text) # fields = detect_fields(sentences, groups=('address',)) fields = detect_fields(sentences) doc.address = fields.get('address') if not doc.address: doc.address = detect_address_default(doc_text, sentences) if doc.address: g = geocoder.google(doc.address) if g.ok: doc.address_latitude = g.lat doc.address_longitude = g.lng doc.address_country = g.country_long doc.address_state_province = g.province_long elif g.status and 'ZERO' in g.status: # Google does not know such address - probably we detected it wrong. doc.address = None doc.address_state_province = None doc.address_country = None doc.address_longitude = None doc.address_latitude = None else: log('Google did not return geocode info for: {0}\nResponse: {1}' .format(doc.address, g), task=task_id) # return # term doc.commencement_date = fields.get('commencement_date') doc.expiration_date = fields.get('expiration_date') term_tuple = fields.get('term') if term_tuple: term = timedelta(days=term_tuple[2]) if doc.commencement_date and not doc.expiration_date: doc.expiration_date = doc.commencement_date + term elif not doc.commencement_date and doc.expiration_date: doc.commencement_date = doc.expiration_date - term if doc.commencement_date \ and doc.expiration_date \ and doc.commencement_date >= doc.expiration_date: doc.expiration_date = None # lease type pay_taxes = int(fields.get('pay_taxes') or False) pay_costs = int(fields.get('pay_costs') or False) pay_insurance = int(fields.get('pay_insurance') or False) lt = pay_taxes + pay_costs + pay_insurance if lt == 3: doc.lease_type = 'triple-net' elif lt == 2: doc.lease_type = 'double-net' elif lt == 1: doc.lease_type = 'single-net' else: doc.lease_type = 'gross' # property type property_types = list(fields.get('property_types__set') or set()) property_types.sort() doc.property_type = '; '.join(property_types) # permitted use doc.permitted_uses = fields.get('permitted_use') # prohibited use doc.prohibited_uses = ProcessLeaseDocuments.ordered_list_without_repetitions( fields.get('prohibited_use__list')) renew_duration_tuple = fields.get('renew_non_renew_notice') if renew_duration_tuple: doc.renew_non_renew_notice_duration = timedelta( days=renew_duration_tuple[2]) auto_renew = fields.get('auto_renew') if auto_renew is not None: doc.auto_renew = auto_renew area_square_feet_list = fields.get('area_square_feet__list') if area_square_feet_list: doc.area_size_sq_ft = area_square_feet_list[0] doc.alterations_allowed = ProcessLeaseDocuments.ordered_list_without_repetitions( fields.get('alterations_allowed__list')) security_deposit = fields.get('security_deposit__set') if security_deposit: doc.security_deposit = max(security_deposit) doc.rent_due_frequency = fields.get('rent_due_frequency') mean_rent_per_month = fields.get('mean_rent_per_month__set') if mean_rent_per_month: doc.mean_rent_per_month = max(mean_rent_per_month)