def extract_constraint_values_from_section(section, verbose=False):
    """Extract money-value constraints from one document section.

    Splits the section body into lines, keeps only the lines where a money
    sum is detected, and extracts constraints from those lines.

    Args:
      section: dict with 'headline.type', 'headline.subdoc' and 'body.subdoc'.
      verbose: when True, prints progress and each matched line.

    Returns:
      dict with 'section' (human-readable head type), 'caption' (headline
      text) and 'sentences' (per-sentence constraint structures).
    """
    _embedd_factory = PricePF

    if verbose:
        print('extract_constraint_values_from_section', section['headline.type'])

    body = section['body.subdoc']

    if verbose:
        print('extract_constraint_values_from_section', 'embedding....')

    # keep only the lines that mention a money sum
    lines_with_sums = []
    sentences = split_by_token(body.tokens, '\n')
    for sentence_tokens in sentences:
        line = untokenize(sentence_tokens) + '\n'
        # NOTE: renamed from `sum`, which shadowed the builtin
        _sum = extract_sum(line)
        if _sum is not None:
            lines_with_sums.append(line)
            if verbose:
                print('-', _sum, line)

    hl_subdoc = section['headline.subdoc']
    r_by_head_type = {
        'section': head_types_dict[section['headline.type']],
        'caption': untokenize(hl_subdoc.tokens_cc),
        'sentences': _extract_constraint_values_from_region(lines_with_sums, _embedd_factory, render=verbose)
    }
    return r_by_head_type
def detect_ners(section, render=False):
    """Detect the organization type and name (NER) in the given section.

    Embeds the section with the NER pattern factory, finds the best-matching
    org-type pattern, then reads the org name from the tokens that follow
    the matched pattern.

    Args:
      section: an embeddable document section (must not be None).
      render: when True, renders colored attention maps and prints the org type.

    Returns:
      dict with 'type', 'name', 'type_name', 'tokens' and 'attention_vector'.
    """
    assert section is not None

    section.embedd(NerPF)
    section.calculate_distances_per_pattern(NerPF)

    dict_org, best_type = _detect_org_type_and_name(section, render)

    if render:
        render_color_text(section.tokens_cc, section.distances_per_pattern_dict[best_type], _range=[0, 1])

    # the org name starts right after the matched pattern: skip past the
    # pattern's own embedded tokens
    start = dict_org[best_type][0]
    start = start + len(NerPF.patterns_dict[best_type].embeddings)
    end = 1 + find_ner_end(section.tokens, start)

    orgname_sub_section = section.subdoc(start, end)
    org_name = untokenize(orgname_sub_section.tokens_cc)

    if render:
        render_color_text(orgname_sub_section.tokens_cc, orgname_sub_section.distances_per_pattern_dict[best_type], _range=[0, 1])
        print('Org type:', org_types[best_type], dict_org[best_type])

    rez = {
        'type': best_type,
        'name': org_name,
        'type_name': org_types[best_type],
        'tokens': section.tokens_cc,
        'attention_vector': section.distances_per_pattern_dict[best_type]
    }
    return rez
def find_sections_by_headlines(best_indexes, _doc, headline_indexes, render=False):
    """Map each detected headline to the document section below it.

    Each value of `best_indexes` describes one headline ('headline.index',
    'headline.type', 'headline.confidence', 'headline.subdoc',
    'headline.attention_v'). For every headline the body text between it and
    the next headline is cut out and stored under 'body.subdoc'; sections
    whose body turns out to be empty are reported and skipped.

    Returns:
      dict of headline structures keyed by headline type.
    """
    sections = {}

    for _key, headline_info in best_indexes.items():
        if render:
            print('=' * 100)
            print(untokenize(headline_info['headline.subdoc'].tokens_cc))
            print('-' * 100)

        head_type = headline_info['headline.type']
        try:
            headline_info['body.subdoc'] = _doc_section_under_headline(_doc, headline_info, headline_indexes, render=render)
            sections[head_type] = headline_info
        except ValueError as error:
            # empty section between two adjacent headlines — skip it
            print(error)

    return sections
def _doc_section_under_headline(_doc, hl_struct, headline_indices, render=False):
    """Cut out the document fragment between a headline and the next one.

    Args:
      _doc: the full document.
      hl_struct: headline structure with 'headline.index' and 'headline.type'.
      headline_indices: line indices of all detected headlines, in order.
      render: when True, prints debug information.

    Returns:
      the sub-document between the given headline and the following one.

    Raises:
      ValueError: when the fragment between the two headlines is (almost) empty.
    """
    if render:
        print('_doc_section_under_headline:searching for section:', hl_struct['headline.type'])

    bi = hl_struct['headline.index']
    best_headline = headline_indices[bi]

    # the section ends where the next headline begins (or at document end)
    next_i = bi + 1
    best_headline_next = headline_indices[next_i] if next_i < len(headline_indices) else None

    if render:
        print(
            '_doc_section_under_headline: best_headline:{} best_headline_next:{} bi:{}'.format(best_headline, best_headline_next, bi),
            '_' * 40)

    subdoc = subdoc_between_lines(best_headline, best_headline_next, _doc)
    if len(subdoc.tokens) < 2:
        raise ValueError(
            'Empty "{}" section between headlines #{} and #{}'.format(hl_struct['headline.type'], best_headline, best_headline_next))

    # May be embedd
    if render:
        print('_doc_section_under_headline: embedding segment:', untokenize(subdoc.tokens_cc))

    return subdoc
def _extract_constraint_values_from_region(sentenses_i, _embedd_factory, render=False):
    """Extract value constraints from a list of sentences (text lines).

    Each sentence is embedded, constraint-attention vectors are computed,
    and all value constraints found in each sentence are collected.

    Args:
      sentenses_i: list of sentence strings (each expected to contain a sum).
      _embedd_factory: pattern factory used for embedding.
      render: when True, renders the attention over each sentence.

    Returns:
      list of dicts with 'quote', 'subdoc' and 'constraints' keys;
      empty list when there is no input.
    """
    # truthiness check covers both None and the empty list
    if not sentenses_i:
        return []

    ssubdocs = embedd_generic_tokenized_sentences(sentenses_i, _embedd_factory)

    for ssubdoc in ssubdocs:
        vectors = make_constraints_attention_vectors(ssubdoc)
        ssubdoc.distances_per_pattern_dict = {**ssubdoc.distances_per_pattern_dict, **vectors}

        if render:
            render_color_text(
                ssubdoc.tokens,
                ssubdoc.distances_per_pattern_dict['deal_value_attention_vector'],
                _range=(0, 1))

    sentences = []
    for sentence_subdoc in ssubdocs:
        constraints: List[ValueConstraint] = extract_all_contraints_from_sentence(
            sentence_subdoc,
            sentence_subdoc.distances_per_pattern_dict['deal_value_attention_vector'])

        sentences.append({
            'quote': untokenize(sentence_subdoc.tokens_cc),
            'subdoc': sentence_subdoc,
            'constraints': constraints
        })

    return sentences
def extract_constraint_values_from_section(self, section: HeadlineMeta):
    """Find margin/transaction value constraints inside one document section.

    Computes sum- and order-related attention over the section body, keeps
    only the lines that mention a money sum, and extracts the constraints
    from those lines.

    Args:
      section: headline metadata holding the section body and caption.

    Returns:
      dict with 'section' (human-readable head type), 'caption' (headline
      text) and 'sentences' (per-sentence constraint structures).
    """
    if self.verbosity_level > 1:
        print('extract_constraint_values_from_section', section.type)

    body = section.body

    # accumulate (merge=True) attention for all sum- and order-related patterns
    body.calculate_distances_per_pattern(self.pattern_factory, pattern_prefix='sum_max', merge=True)
    body.calculate_distances_per_pattern(self.pattern_factory, pattern_prefix='sum__', merge=True)
    body.calculate_distances_per_pattern(self.pattern_factory, pattern_prefix='d_order_', merge=True)

    a_vectors = make_constraints_attention_vectors(body)
    body.distances_per_pattern_dict = {**body.distances_per_pattern_dict, **a_vectors}

    if self.verbosity_level > 1:
        print('extract_constraint_values_from_section', 'embedding....')

    sentenses_having_values: List[LegalDocument] = []

    # split the body into line ranges and keep only lines where a sum is detected
    ranges = split_by_token_into_ranges(body.tokens, '\n')

    for _slice in ranges:
        __line = untokenize(body.tokens[_slice])
        _sum = extract_sum(__line)

        if _sum is not None:
            ss_subdoc = body.subdoc_slice(_slice, name=f'value_sent:{_slice.start}')
            sentenses_having_values.append(ss_subdoc)

            if self.verbosity_level > 2:
                print('-', _sum, __line)

    r_by_head_type = {
        'section': head_types_dict[section.type],
        'caption': untokenize(section.subdoc.tokens_cc),
        'sentences': self.__extract_constraint_values_from_region(sentenses_having_values)
    }

    self._logstep(f"Finding margin transaction values in section {untokenize(section.subdoc.tokens_cc)}")
    return r_by_head_type
def test_split_by_number(self):
    """Every fragment produced by split_by_number_2 must start with a digit."""
    import nltk

    for (_price, _currency, text) in data:
        normalized = normalize_text(text, replacements_regex)
        # TODO: fix nltk problem, use d.parse()
        tokens = nltk.word_tokenize(normalized)

        fragments, _, _ = split_by_number_2(tokens, np.ones(len(tokens)), 0.1)
        for fragment in fragments:
            joined = untokenize(fragment)
            print('\t-', fragment)
            self.assertTrue(joined[0].isdigit())
def extract_sum_and_sign_2(subdoc, region: slice) -> ValueConstraint:
    """Extract a money value and its comparison sign from a token region.

    Looks at a few tokens *before* the region to detect the sign
    (e.g. "no more than" / "no less than"), then parses the sum from the
    widened region.

    Args:
      subdoc: document fragment; tokens are read from `subdoc.tokens_cc`.
      region: slice of the tokens holding the value expression.

    Returns:
      ValueConstraint with the parsed value, currency and sign;
      value is NaN and currency 'UNDEF' when no sum is found.
    """
    # clamp to 0: a negative slice start would wrap around to the list tail
    # and yield a wrong (or empty) token window
    _slice = slice(max(0, region.start - VALUE_SIGN_MIN_TOKENS), region.stop)
    subtokens = subdoc.tokens_cc[_slice]

    # the sign is detected in the prefix tokens preceding the value
    _prefix_tokens = subtokens[0:VALUE_SIGN_MIN_TOKENS + 1]
    _prefix = untokenize(_prefix_tokens)
    _sign = detect_sign(_prefix)
    # ======================================
    _sum = extract_sum_from_tokens_2(subtokens)
    # ======================================

    currency = "UNDEF"
    value = np.nan
    if _sum is not None:
        value = _sum[0]
        # normalize currency spelling via the currency map when known
        currency = currencly_map.get(_sum[1], _sum[1])

    vc = ValueConstraint(value, currency, _sign, TokensWithAttention([], []))
    return vc
def extract_sum_and_sign(subdoc, region) -> ValueConstraint:
    """Extract a money value and its comparison sign from a token region.

    Args:
      subdoc: document fragment; tokens are read from `subdoc.tokens_cc`.
      region: (start, stop) pair of token indices holding the value expression.

    Returns:
      ValueConstraint with the parsed value, currency and sign;
      value is NaN and currency 'UNDEF' when no sum is found.
    """
    # clamp to 0: a negative slice start would wrap around to the list tail
    # and yield a wrong (or empty) token window
    subtokens = subdoc.tokens_cc[max(0, region[0] - VALUE_SIGN_MIN_TOKENS):region[1]]

    # the sign is detected in the prefix tokens preceding the value
    _prefix_tokens = subtokens[0:VALUE_SIGN_MIN_TOKENS + 1]
    _prefix = untokenize(_prefix_tokens)
    _sign = detect_sign(_prefix)
    # ======================================
    _sum = extract_sum_from_tokens(subtokens)[0]
    # ======================================

    currency = "UNDEF"
    value = np.nan
    if _sum is not None:
        value = _sum[0]
        # normalize currency spelling via the currency map when known
        currency = currencly_map.get(_sum[1], _sum[1])

    vc = ValueConstraint(value, currency, _sign, TokensWithAttention([''], [0]))
    return vc
def headline_probability(sentence: List[str], sentence_cc, sentence_meta: StructureLine, prev_sentence, prev_value) -> float:
    """
    Score how likely a tokenized line is a document headline.

    Additive heuristic score: positive values favor a headline; NEG (-1)
    or any negative return rejects the line outright.

    _cc == original case
    """
    NEG = -1
    value = 0

    # hard rejects: blank line, too short, far too long
    if sentence == ['\n']:
        return NEG
    if len(sentence) < 2:
        return NEG
    if len(sentence) > 20:
        return NEG
    if len(sentence) > 10:
        value -= 2

    # headline is short enough
    if len(sentence) < 10:
        value += 1
    if 3 <= len(sentence) <= 6:
        value += 1

    # headline may not go after another headline
    if prev_value > 0:
        value -= prev_value / 2

    # if it ends with a number, it is a contents-line
    if len(sentence) > 3:
        r_off = 2
        if sentence[-r_off] == '.':
            r_off = 3
        if sentence[-r_off].isdigit():
            value -= 1.8

    _level = sentence_meta.level

    # first 40 chars of the original-case text, numbering prefix stripped
    row = untokenize(sentence_cc[sentence_meta.text_offset:])[:40]
    row = row.lstrip()

    if strange_symbols.search(row) is not None:
        value -= 2

    if sentence_meta.numbered:
        # headline starts from 'статья'
        if sentence[0] == 'статья':
            value += 3
        if sentence_meta.minor_number > 0:
            value += 1
        # headline number is NOT too big
        if sentence_meta.minor_number > 40:
            value -= 1
        # headline is NOT a bullet
        if sentence_meta.minor_number < 0:
            return NEG
        # ----
        if _level is not None:
            if _level == 0:
                value += 1
            if _level > 1:
                # headline is NOT a 1.2 - like-numbered
                return -_level

    # ------- any number
    # headline DOES not start from lowercase
    if len(row) > 0:
        if row.lower()[0] == row[0]:
            value -= 3

    # headline is UPPERCASE
    if row.upper() == row:
        if not row.isdigit():  # there some trash
            value += 1.5

    # a blank line right before the candidate favors a headline
    if prev_sentence == ['\n'] and sentence != ['\n']:
        value += 1

    return value
def to_string(self, tokens):
    """Render this line's slice of `tokens` back into a plain string."""
    return untokenize(tokens[self.slice])
def to_string_no_number(self, tokens_cc):
    """Render the line text without its leading number/bullet prefix.

    Skips `text_offset` tokens (the numbering) from the start of the span.
    """
    return untokenize(tokens_cc[self.span[0] + self.text_offset:self.span[1]])
def extract_sum_from_tokens(sentence_tokens: List):
    """Join tokens into a lowercase sentence and parse a money sum from it.

    Returns:
      (sum, sentence) pair; sum is None when nothing is found.
    """
    joined = untokenize(sentence_tokens).lower().strip()
    return extract_sum(joined), joined
def check_contract_value(self, contract_value: ProbableValue, convet_m, renderer):
    """Check a contract value against this constraint's lower/upper bounds.

    Converts all values to a comparable currency (via `maybe_convert` and
    `convet_m`) and builds an HTML report: a warning when the contract sum
    exceeds the lower threshold, an error when it exceeds the upper one.

    Args:
      contract_value: the detected contract value (may be None).
      convet_m: conversion data passed through to `maybe_convert`.
      renderer: used to render the constraint context as colored text.

    Returns:
      HTML string with the verdict (or an error message when the value
      is unknown/malformed).
    """
    greather_lower = False
    # NOTE(review): greather_upper is assigned but never used
    greather_upper = False

    # guard clauses: the contract value must be known and well-formed
    if contract_value is None:
        return as_error_html("сумма контракта неизвестна")
    v: ValueConstraint = contract_value.value

    if v is None:
        return as_error_html("сумма контракта не верна")
    if v.value is None:
        return as_error_html(f"сумма контракта не верна {v.currency}")

    ###----
    lower_v = None
    upper_v = None
    if self.lower is not None:
        lower_v: ValueConstraint = self.lower.value
    if self.upper is not None:
        upper_v: ValueConstraint = self.upper.value

    html = as_msg(f"диапазон: {as_currency(lower_v)} < ..... < {as_currency(upper_v)}")

    # convert the contract value into the comparison currency
    v, v_converted, h = self.maybe_convert(v, convet_m)
    html += h

    if self.lower is not None:
        lower_v: ValueConstraint = self.lower.value
        lower_v, lower_converted, h = self.maybe_convert(lower_v, convet_m)
        html += h

        if v_converted.value >= lower_converted.value:
            greather_lower = True
            html += as_warning("требуется одобрение...".upper())
            html += as_warning(
                f"сумма договора {as_currency(v_converted)} БОЛЬШЕ нижней пороговой {as_currency(lower_converted)} ")
            html += as_quote(untokenize(lower_v.context.tokens))

    if self.upper is not None:
        upper_v: ValueConstraint = self.upper.value
        upper_v, upper_converted, h = self.maybe_convert(upper_v, convet_m)
        html += h

        if v_converted.value >= upper_converted.value:
            html += as_error_html(
                f"сумма договора {as_currency(v_converted)} БОЛЬШЕ верхней пороговой {as_currency(upper_converted)} ")
        elif greather_lower:
            head_name = self.head_type_name
            html += as_error_html(f'требуется одобрение со стороны "{head_types_dict[head_name]}"')

            # NOTE(review): nesting reconstructed from flattened source — the
            # context quotes appear to belong to this `elif greather_lower`
            # branch; confirm against the original file.
            if lower_v.context is not None:
                html += as_quote(renderer.to_color_text(lower_v.context.tokens, lower_v.context.attention, _range=[0, 1]))
            if upper_v.context is not None:
                html += '<br>'
                html += as_quote(renderer.to_color_text(upper_v.context.tokens, upper_v.context.attention, _range=[0, 1]))

    return html