from typing import List

import nltk
import numpy as np

# NOTE: project-internal names (LegalDocument, AbstractPatternFactory,
# ContractPatternFactory, ProbableValue, HeadlineMeta, etc.) are assumed
# to be imported from the project's own modules.


def embedd_generic_tokenized_sentences(strings: List[str],
                                       factory: AbstractPatternFactory) -> List[LegalDocument]:
  embedded_docs = []

  if strings is None or len(strings) == 0:
    return []

  tokenized_sentences_list = []
  for s in strings:
    words = nltk.word_tokenize(s)

    subdoc = LegalDocument()
    subdoc.tokens = words
    subdoc.tokens_cc = words

    tokenized_sentences_list.append(subdoc.tokens)
    embedded_docs.append(subdoc)

  sentences_emb, wrds, lens = embedd_tokenized_sentences_list(factory.embedder, tokenized_sentences_list)

  for i in range(len(embedded_docs)):
    l = lens[i]
    tokens = wrds[i][:l]
    line_emb = sentences_emb[i][:l]

    embedded_docs[i].tokens = tokens
    embedded_docs[i].tokens_cc = tokens
    embedded_docs[i].embeddings = line_emb
    embedded_docs[i].calculate_distances_per_pattern(factory)

  return embedded_docs

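# Illustrative usage (a sketch, not part of the pipeline): `my_factory` is a
# hypothetical, fully initialized AbstractPatternFactory with a loaded embedder.
#
#   sentences = ['1. Subject of the Contract', '2. Contract Price']
#   docs = embedd_generic_tokenized_sentences(sentences, my_factory)
#   for d in docs:
#     print(len(d.tokens), d.embeddings.shape)
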
def _detect_org_type_and_name(self, section: LegalDocument):
  factory = self.pattern_factory
  vectors = section.distances_per_pattern_dict  # shortcut

  section.calculate_distances_per_pattern(factory, pattern_prefix='org_', merge=True)
  section.calculate_distances_per_pattern(factory, pattern_prefix='ner_org', merge=True)
  section.calculate_distances_per_pattern(factory, pattern_prefix='nerneg_', merge=True)

  vectors['s_attention_vector_neg'] = factory._build_org_type_attention_vector(section)

  org_by_type = {}
  best_org_type = None
  _max = 0
  for org_type in org_types.keys():
    vector = vectors[org_type] * vectors['s_attention_vector_neg']
    if self.verbosity_level > 2:
      print('_detect_org_type_and_name, org_type=', org_type, vectors[org_type][0:10])

    idx = np.argmax(vector)
    val = vectors[org_type][idx]
    if val > _max:
      _max = val
      best_org_type = org_type

    org_by_type[org_type] = [idx, val]

  if self.verbosity_level > 2:
    print('_detect_org_type_and_name', org_by_type)

  return org_by_type, best_org_type

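# Sketch of consuming the result (illustrative): org_by_type maps each known
# org type to a [token_index, score] pair; best_org_type is the type whose
# peak score was highest under the negated attention mask.
#
#   org_by_type, best_org_type = self._detect_org_type_and_name(section)
#   if best_org_type is not None:
#     idx, score = org_by_type[best_org_type]
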
def map_subject_to_type(self, section: LegalDocument, denominator: float = 1) -> List[ProbableValue]:
  """
  :param section: section of the document to classify
  :param denominator: confidence multiplier
  :return: one ProbableValue per known contract subject kind
  """
  section.calculate_distances_per_pattern(self.pattern_factory, merge=True, pattern_prefix='x_ContractSubject')
  all_subjects_vectors = filter_values_by_key_prefix(section.distances_per_pattern_dict, 'x_ContractSubject')
  all_mean = rectifyed_sum(all_subjects_vectors)

  subjects_mapping = []
  for subject_kind in contract_subjects:
    x = self.make_subject_attention_vector_3(section, subject_kind, all_mean)

    # confidence, sum_, nonzeros_count, _max = estimate_confidence(x)
    confidence = self.estimate_confidence_2(x)
    confidence *= denominator

    pv = ProbableValue(subject_kind, confidence)
    subjects_mapping.append(pv)

  return subjects_mapping

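# Example of consuming the mapping (a sketch; assumes ProbableValue exposes
# a .confidence attribute, as suggested by its constructor arguments):
#
#   subjects = self.map_subject_to_type(section)
#   best_subject = max(subjects, key=lambda pv: pv.confidence)
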
def find_sections(self, doc: LegalDocument, factory: AbstractPatternFactory, headlines: List[str],
                  headline_patterns_prefix: str = 'headline.',
                  additional_attention: List[float] = None) -> dict:
  embedded_headlines = doc.embedd_headlines(factory)

  doc.sections = doc.find_sections_by_headlines_2(
    self.ctx, headlines, embedded_headlines, headline_patterns_prefix,
    self.ctx.config.headline_attention_threshold)

  self.ctx._logstep("embedding headlines into semantic space")

  return doc.sections

def _try_to_fetch_value_from_section_2(value_section: LegalDocument,
                                       factory: ContractPatternFactory) -> List[ProbableValue]:
  value_section.calculate_distances_per_pattern(factory)

  vectors = factory.make_contract_value_attention_vectors(value_section)
  value_section.distances_per_pattern_dict = {**value_section.distances_per_pattern_dict, **vectors}

  values: List[ProbableValue] = extract_all_contraints_from_sr_2(
    value_section, value_section.distances_per_pattern_dict['value_attention_vector_tuned'])

  return values

def _try_to_fetch_value_from_section___(value_section: LegalDocument,
                                        factory: ContractPatternFactory) -> List[ProbableValue]:
  # value_section.embedd(factory)
  value_section.calculate_distances_per_pattern(factory)
  # context._logstep(f'embedding for transaction values in section "{section_name}"')

  vectors = factory.make_contract_value_attention_vectors(value_section)
  value_section.distances_per_pattern_dict = {**value_section.distances_per_pattern_dict, **vectors}

  values: List[ProbableValue] = extract_all_contraints_from_sentence(
    value_section, value_section.distances_per_pattern_dict['value_attention_vector_tuned'])

  return values

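# Sketch: reducing the fetched constraints to the single most confident value
# (assumes ProbableValue exposes .confidence; names here are illustrative):
#
#   values = _try_to_fetch_value_from_section_2(value_section, factory)
#   best_value = max(values, key=lambda pv: pv.confidence) if values else None
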
def __init__(self, original_text: str):
  LegalDocument.__init__(self, original_text)
  self.subject = ('unknown', 1.0)
  self.contract_values: List[ProbableValue] = []

def find_sections(self, doc: LegalDocument, factory: AbstractPatternFactory, headlines: List[str],
                  headline_patterns_prefix: str = 'headline.',
                  additional_attention: List[float] = None) -> dict:
  """
  Fuzzily finds sections in the doc.

  TODO: try it on Contracts and Protocols as well
  TODO: if it works well, move it from here

  🍄 🍄 🍄 🍄 🍄 Keep it in the dark and feed it sh**
  """

  def is_hl_more_confident(a: HeadlineMeta, b: HeadlineMeta):
    return a.confidence > b.confidence

  headlines_attention_vector = self.normalize_headline_attention_vector(self.make_headline_attention_vector(doc))

  section_by_index = {}
  for section_type in headlines:  # like ['name.', 'head.all.', 'head.gen.', 'head.directors.']
    pattern_prefix = f'{headline_patterns_prefix}{section_type}'
    doc.calculate_distances_per_pattern(factory, pattern_prefix=pattern_prefix, merge=True)

    # warning! these are the boundaries of the headline, not of the entire section
    bounds, confidence, attention = self._find_charter_section_start(doc, pattern_prefix,
                                                                     headlines_attention_vector,
                                                                     additional_attention)
    if confidence > 0.5:
      sl = slice(bounds[0], bounds[1])
      hl_info = HeadlineMeta(None, section_type, confidence, doc.subdoc_slice(sl, name=section_type))
      hl_info.attention = attention
      put_if_better(section_by_index, key=sl.start, x=hl_info, is_better=is_hl_more_confident)
  # end-for

  # now slicing the doc into sections
  sorted_starts = sorted(section_by_index.keys())
  # sorted_starts.append(len(doc.tokens))

  section_by_type = {}
  for i in range(len(sorted_starts)):
    index = sorted_starts[i]
    section: HeadlineMeta = section_by_index[index]

    start = index  # todo: probably take the end of the caption
    end = doc.structure.next_headline_after(start)
    # end_alt = sorted_starts[i + 1]

    section_len = end - start
    # if section_len > 5000:
    #   self.ctx.warning(
    #     f'Section "{section.subdoc.untokenize_cc()[:100]}" is probably way too large {section_len}, trimming to 5000')
    #   section_len = 5000

    sli = slice(start, start + section_len)
    section.body = doc.subdoc_slice(sli, name=section.type)
    section.attention = section.attention[sli]
    section_by_type[section.type] = section
  # end-for

  doc.sections = section_by_type
  self.ctx._logstep("Splitting Document into sections ✂️ 📃 -> 📄📄📄")

  return section_by_type

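# A minimal sketch of the put_if_better helper used above (assumed semantics:
# keep whichever candidate wins under is_better when two sections start at the
# same token index; the project's real helper may differ).
def put_if_better(dest: dict, key, x, is_better) -> None:
  # insert x, or replace the existing entry only if x beats it
  if key not in dest or is_better(x, dest[key]):
    dest[key] = x
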
def __init__(self, original_text):
  LegalDocument.__init__(self, original_text)

def __init__(self, original_text):
  LegalDocument.__init__(self, original_text)
  self.subjects: List[ProbableValue] = [ProbableValue(ContractSubject.Other, 0.0)]
  self.contract_values: List[ProbableValue] = []

def test_tokenize_doc_custom_padding(self):
  doc = LegalDocument()
  tokens = doc.tokenize('aa bb cc')
  print(tokens)
  self.assertEqual(3, len(tokens))