def extract_constraint_values_from_section(section, verbose=False):
    """Extract money-value constraints from one document section.

    Splits the section body into lines, keeps only the lines where a money
    sum is detected, and extracts constraints from those lines.

    Args:
      section: dict with 'headline.type', 'headline.subdoc' and 'body.subdoc'.
      verbose: when True, prints progress and each matched line.

    Returns:
      dict with 'section' (human-readable head type), 'caption' (headline
      text) and 'sentences' (per-sentence constraint structures).
    """
    _embedd_factory = PricePF

    if verbose:
        print('extract_constraint_values_from_section', section['headline.type'])

    body = section['body.subdoc']

    if verbose:
        print('extract_constraint_values_from_section', 'embedding....')

    # keep only the lines that mention a money sum
    lines_with_sums = []
    sentences = split_by_token(body.tokens, '\n')
    for sentence_tokens in sentences:
        line = untokenize(sentence_tokens) + '\n'
        # NOTE: renamed from `sum`, which shadowed the builtin
        _sum = extract_sum(line)
        if _sum is not None:
            lines_with_sums.append(line)
            if verbose:
                print('-', _sum, line)

    hl_subdoc = section['headline.subdoc']
    r_by_head_type = {
        'section': head_types_dict[section['headline.type']],
        'caption': untokenize(hl_subdoc.tokens_cc),
        'sentences': _extract_constraint_values_from_region(lines_with_sums, _embedd_factory, render=verbose)
    }
    return r_by_head_type
def detect_ners(section, render=False):
    """Detect the organization type and name (NER) in the given section.

    Embeds the section with the NER pattern factory, finds the best-matching
    org-type pattern, then reads the org name from the tokens that follow
    the matched pattern.

    Args:
      section: an embeddable document section (must not be None).
      render: when True, renders colored attention maps and prints the org type.

    Returns:
      dict with 'type', 'name', 'type_name', 'tokens' and 'attention_vector'.
    """
    assert section is not None

    section.embedd(NerPF)
    section.calculate_distances_per_pattern(NerPF)

    dict_org, best_type = _detect_org_type_and_name(section, render)

    if render:
        render_color_text(section.tokens_cc, section.distances_per_pattern_dict[best_type], _range=[0, 1])

    # the org name starts right after the matched pattern: skip past the
    # pattern's own embedded tokens
    start = dict_org[best_type][0]
    start = start + len(NerPF.patterns_dict[best_type].embeddings)
    end = 1 + find_ner_end(section.tokens, start)

    orgname_sub_section = section.subdoc(start, end)
    org_name = untokenize(orgname_sub_section.tokens_cc)

    if render:
        render_color_text(orgname_sub_section.tokens_cc, orgname_sub_section.distances_per_pattern_dict[best_type], _range=[0, 1])
        print('Org type:', org_types[best_type], dict_org[best_type])

    rez = {
        'type': best_type,
        'name': org_name,
        'type_name': org_types[best_type],
        'tokens': section.tokens_cc,
        'attention_vector': section.distances_per_pattern_dict[best_type]
    }
    return rez
def find_sections_by_headlines(best_indexes, _doc, headline_indexes, render=False):
    """Map each detected headline to the document section below it.

    Each value of `best_indexes` describes one headline ('headline.index',
    'headline.type', 'headline.confidence', 'headline.subdoc',
    'headline.attention_v'). For every headline the body text between it and
    the next headline is cut out and stored under 'body.subdoc'; sections
    whose body turns out to be empty are reported and skipped.

    Returns:
      dict of headline structures keyed by headline type.
    """
    sections = {}

    for _key, headline_info in best_indexes.items():
        if render:
            print('=' * 100)
            print(untokenize(headline_info['headline.subdoc'].tokens_cc))
            print('-' * 100)

        head_type = headline_info['headline.type']
        try:
            headline_info['body.subdoc'] = _doc_section_under_headline(_doc, headline_info, headline_indexes, render=render)
            sections[head_type] = headline_info
        except ValueError as error:
            # empty section between two adjacent headlines — skip it
            print(error)

    return sections
def _doc_section_under_headline(_doc, hl_struct, headline_indices, render=False):
    """Cut out the document fragment between a headline and the next one.

    Args:
      _doc: the full document.
      hl_struct: headline structure with 'headline.index' and 'headline.type'.
      headline_indices: line indices of all detected headlines, in order.
      render: when True, prints debug information.

    Returns:
      the sub-document between the given headline and the following one.

    Raises:
      ValueError: when the fragment between the two headlines is (almost) empty.
    """
    if render:
        print('_doc_section_under_headline:searching for section:', hl_struct['headline.type'])

    bi = hl_struct['headline.index']
    best_headline = headline_indices[bi]

    # the section ends where the next headline begins (or at document end)
    next_i = bi + 1
    best_headline_next = headline_indices[next_i] if next_i < len(headline_indices) else None

    if render:
        print(
            '_doc_section_under_headline: best_headline:{} best_headline_next:{} bi:{}'.format(best_headline, best_headline_next, bi),
            '_' * 40)

    subdoc = subdoc_between_lines(best_headline, best_headline_next, _doc)
    if len(subdoc.tokens) < 2:
        raise ValueError(
            'Empty "{}" section between headlines #{} and #{}'.format(hl_struct['headline.type'], best_headline, best_headline_next))

    # May be embedd
    if render:
        print('_doc_section_under_headline: embedding segment:', untokenize(subdoc.tokens_cc))

    return subdoc
def _extract_constraint_values_from_region(sentenses_i, _embedd_factory, render=False):
    """Extract value constraints from a list of sentences (text lines).

    Each sentence is embedded, constraint-attention vectors are computed,
    and all value constraints found in each sentence are collected.

    Args:
      sentenses_i: list of sentence strings (each expected to contain a sum).
      _embedd_factory: pattern factory used for embedding.
      render: when True, renders the attention over each sentence.

    Returns:
      list of dicts with 'quote', 'subdoc' and 'constraints' keys;
      empty list when there is no input.
    """
    # truthiness check covers both None and the empty list
    if not sentenses_i:
        return []

    ssubdocs = embedd_generic_tokenized_sentences(sentenses_i, _embedd_factory)

    for ssubdoc in ssubdocs:
        vectors = make_constraints_attention_vectors(ssubdoc)
        ssubdoc.distances_per_pattern_dict = {**ssubdoc.distances_per_pattern_dict, **vectors}

        if render:
            render_color_text(
                ssubdoc.tokens,
                ssubdoc.distances_per_pattern_dict['deal_value_attention_vector'],
                _range=(0, 1))

    sentences = []
    for sentence_subdoc in ssubdocs:
        constraints: List[ValueConstraint] = extract_all_contraints_from_sentence(
            sentence_subdoc,
            sentence_subdoc.distances_per_pattern_dict['deal_value_attention_vector'])

        sentences.append({
            'quote': untokenize(sentence_subdoc.tokens_cc),
            'subdoc': sentence_subdoc,
            'constraints': constraints
        })

    return sentences
def extract_constraint_values_from_section(self, section: HeadlineMeta):
    """Find margin/transaction value constraints inside one document section.

    Computes sum- and order-related attention over the section body, keeps
    only the lines that mention a money sum, and extracts the constraints
    from those lines.

    Args:
      section: headline metadata holding the section body and caption.

    Returns:
      dict with 'section' (human-readable head type), 'caption' (headline
      text) and 'sentences' (per-sentence constraint structures).
    """
    if self.verbosity_level > 1:
        print('extract_constraint_values_from_section', section.type)

    body = section.body

    # accumulate (merge=True) attention for all sum- and order-related patterns
    body.calculate_distances_per_pattern(self.pattern_factory, pattern_prefix='sum_max', merge=True)
    body.calculate_distances_per_pattern(self.pattern_factory, pattern_prefix='sum__', merge=True)
    body.calculate_distances_per_pattern(self.pattern_factory, pattern_prefix='d_order_', merge=True)

    a_vectors = make_constraints_attention_vectors(body)
    body.distances_per_pattern_dict = {**body.distances_per_pattern_dict, **a_vectors}

    if self.verbosity_level > 1:
        print('extract_constraint_values_from_section', 'embedding....')

    sentenses_having_values: List[LegalDocument] = []

    # split the body into line ranges and keep only lines where a sum is detected
    ranges = split_by_token_into_ranges(body.tokens, '\n')

    for _slice in ranges:
        __line = untokenize(body.tokens[_slice])
        _sum = extract_sum(__line)

        if _sum is not None:
            ss_subdoc = body.subdoc_slice(_slice, name=f'value_sent:{_slice.start}')
            sentenses_having_values.append(ss_subdoc)

            if self.verbosity_level > 2:
                print('-', _sum, __line)

    r_by_head_type = {
        'section': head_types_dict[section.type],
        'caption': untokenize(section.subdoc.tokens_cc),
        'sentences': self.__extract_constraint_values_from_region(sentenses_having_values)
    }

    self._logstep(f"Finding margin transaction values in section {untokenize(section.subdoc.tokens_cc)}")
    return r_by_head_type
def test_split_by_number(self):
    """Every fragment produced by split_by_number_2 must start with a digit."""
    import nltk

    for (_price, _currency, text) in data:
        normalized = normalize_text(text, replacements_regex)
        # TODO: fix nltk problem, use d.parse()
        tokens = nltk.word_tokenize(normalized)

        fragments, _, _ = split_by_number_2(tokens, np.ones(len(tokens)), 0.1)
        for fragment in fragments:
            joined = untokenize(fragment)
            print('\t-', fragment)
            self.assertTrue(joined[0].isdigit())
def extract_sum_and_sign_2(subdoc, region: slice) -> ValueConstraint:
    """Extract a money value and its comparison sign from a token region.

    Looks at a few tokens *before* the region to detect the sign
    (e.g. "no more than" / "no less than"), then parses the sum from the
    widened region.

    Args:
      subdoc: document fragment; tokens are read from `subdoc.tokens_cc`.
      region: slice of the tokens holding the value expression.

    Returns:
      ValueConstraint with the parsed value, currency and sign;
      value is NaN and currency 'UNDEF' when no sum is found.
    """
    # clamp to 0: a negative slice start would wrap around to the list tail
    # and yield a wrong (or empty) token window
    _slice = slice(max(0, region.start - VALUE_SIGN_MIN_TOKENS), region.stop)
    subtokens = subdoc.tokens_cc[_slice]

    # the sign is detected in the prefix tokens preceding the value
    _prefix_tokens = subtokens[0:VALUE_SIGN_MIN_TOKENS + 1]
    _prefix = untokenize(_prefix_tokens)
    _sign = detect_sign(_prefix)
    # ======================================
    _sum = extract_sum_from_tokens_2(subtokens)
    # ======================================

    currency = "UNDEF"
    value = np.nan
    if _sum is not None:
        value = _sum[0]
        # normalize currency spelling via the currency map when known
        currency = currencly_map.get(_sum[1], _sum[1])

    vc = ValueConstraint(value, currency, _sign, TokensWithAttention([], []))
    return vc
def extract_sum_and_sign(subdoc, region) -> ValueConstraint:
    """Extract a money value and its comparison sign from a token region.

    Args:
      subdoc: document fragment; tokens are read from `subdoc.tokens_cc`.
      region: (start, stop) pair of token indices holding the value expression.

    Returns:
      ValueConstraint with the parsed value, currency and sign;
      value is NaN and currency 'UNDEF' when no sum is found.
    """
    # clamp to 0: a negative slice start would wrap around to the list tail
    # and yield a wrong (or empty) token window
    subtokens = subdoc.tokens_cc[max(0, region[0] - VALUE_SIGN_MIN_TOKENS):region[1]]

    # the sign is detected in the prefix tokens preceding the value
    _prefix_tokens = subtokens[0:VALUE_SIGN_MIN_TOKENS + 1]
    _prefix = untokenize(_prefix_tokens)
    _sign = detect_sign(_prefix)
    # ======================================
    _sum = extract_sum_from_tokens(subtokens)[0]
    # ======================================

    currency = "UNDEF"
    value = np.nan
    if _sum is not None:
        value = _sum[0]
        # normalize currency spelling via the currency map when known
        currency = currencly_map.get(_sum[1], _sum[1])

    vc = ValueConstraint(value, currency, _sign, TokensWithAttention([''], [0]))
    return vc
def headline_probability(sentence: List[str], sentence_cc, sentence_meta: StructureLine, prev_sentence, prev_value) -> float:
    """
    Score how likely a tokenized line is a document headline.

    Additive heuristic score: positive values favor a headline; NEG (-1)
    or any negative return rejects the line outright.

    _cc == original case
    """
    NEG = -1
    value = 0

    # hard rejects: blank line, too short, far too long
    if sentence == ['\n']:
        return NEG
    if len(sentence) < 2:
        return NEG
    if len(sentence) > 20:
        return NEG
    if len(sentence) > 10:
        value -= 2

    # headline is short enough
    if len(sentence) < 10:
        value += 1
    if 3 <= len(sentence) <= 6:
        value += 1

    # headline may not go after another headline
    if prev_value > 0:
        value -= prev_value / 2

    # if it ends with a number, it is a contents-line
    if len(sentence) > 3:
        r_off = 2
        if sentence[-r_off] == '.':
            r_off = 3
        if sentence[-r_off].isdigit():
            value -= 1.8

    _level = sentence_meta.level

    # first 40 chars of the original-case text, numbering prefix stripped
    row = untokenize(sentence_cc[sentence_meta.text_offset:])[:40]
    row = row.lstrip()

    if strange_symbols.search(row) is not None:
        value -= 2

    if sentence_meta.numbered:
        # headline starts from 'статья'
        if sentence[0] == 'статья':
            value += 3
        if sentence_meta.minor_number > 0:
            value += 1
        # headline number is NOT too big
        if sentence_meta.minor_number > 40:
            value -= 1
        # headline is NOT a bullet
        if sentence_meta.minor_number < 0:
            return NEG
        # ----
        if _level is not None:
            if _level == 0:
                value += 1
            if _level > 1:
                # headline is NOT a 1.2 - like-numbered
                return -_level

    # ------- any number
    # headline DOES not start from lowercase
    if len(row) > 0:
        if row.lower()[0] == row[0]:
            value -= 3

    # headline is UPPERCASE
    if row.upper() == row:
        if not row.isdigit():  # there some trash
            value += 1.5

    # a blank line right before the candidate favors a headline
    if prev_sentence == ['\n'] and sentence != ['\n']:
        value += 1

    return value
def to_string(self, tokens):
    """Render this line's slice of `tokens` back into a plain string."""
    return untokenize(tokens[self.slice])
def to_string_no_number(self, tokens_cc):
    """Render the line text without its leading number/bullet prefix.

    Skips `text_offset` tokens (the numbering) from the start of the span.
    """
    return untokenize(tokens_cc[self.span[0] + self.text_offset:self.span[1]])
def extract_sum_from_tokens(sentence_tokens: List):
    """Join tokens into a lowercase sentence and parse a money sum from it.

    Returns:
      (sum, sentence) pair; sum is None when nothing is found.
    """
    joined = untokenize(sentence_tokens).lower().strip()
    return extract_sum(joined), joined
def check_contract_value(self, contract_value: ProbableValue, convet_m, renderer):
    """Check a contract value against this constraint's lower/upper bounds.

    Converts all values to a comparable currency (via `maybe_convert` and
    `convet_m`) and builds an HTML report: a warning when the contract sum
    exceeds the lower threshold, an error when it exceeds the upper one.

    Args:
      contract_value: the detected contract value (may be None).
      convet_m: conversion data passed through to `maybe_convert`.
      renderer: used to render the constraint context as colored text.

    Returns:
      HTML string with the verdict (or an error message when the value
      is unknown/malformed).
    """
    greather_lower = False
    # NOTE(review): greather_upper is assigned but never used
    greather_upper = False

    # guard clauses: the contract value must be known and well-formed
    if contract_value is None:
        return as_error_html("сумма контракта неизвестна")
    v: ValueConstraint = contract_value.value

    if v is None:
        return as_error_html("сумма контракта не верна")
    if v.value is None:
        return as_error_html(f"сумма контракта не верна {v.currency}")

    ###----
    lower_v = None
    upper_v = None
    if self.lower is not None:
        lower_v: ValueConstraint = self.lower.value
    if self.upper is not None:
        upper_v: ValueConstraint = self.upper.value

    html = as_msg(f"диапазон: {as_currency(lower_v)} < ..... < {as_currency(upper_v)}")

    # convert the contract value into the comparison currency
    v, v_converted, h = self.maybe_convert(v, convet_m)
    html += h

    if self.lower is not None:
        lower_v: ValueConstraint = self.lower.value
        lower_v, lower_converted, h = self.maybe_convert(lower_v, convet_m)
        html += h

        if v_converted.value >= lower_converted.value:
            greather_lower = True
            html += as_warning("требуется одобрение...".upper())
            html += as_warning(
                f"сумма договора {as_currency(v_converted)} БОЛЬШЕ нижней пороговой {as_currency(lower_converted)} ")
            html += as_quote(untokenize(lower_v.context.tokens))

    if self.upper is not None:
        upper_v: ValueConstraint = self.upper.value
        upper_v, upper_converted, h = self.maybe_convert(upper_v, convet_m)
        html += h

        if v_converted.value >= upper_converted.value:
            html += as_error_html(
                f"сумма договора {as_currency(v_converted)} БОЛЬШЕ верхней пороговой {as_currency(upper_converted)} ")
        elif greather_lower:
            head_name = self.head_type_name
            html += as_error_html(f'требуется одобрение со стороны "{head_types_dict[head_name]}"')

            # NOTE(review): nesting reconstructed from flattened source — the
            # context quotes appear to belong to this `elif greather_lower`
            # branch; confirm against the original file.
            if lower_v.context is not None:
                html += as_quote(renderer.to_color_text(lower_v.context.tokens, lower_v.context.attention, _range=[0, 1]))
            if upper_v.context is not None:
                html += '<br>'
                html += as_quote(renderer.to_color_text(upper_v.context.tokens, upper_v.context.attention, _range=[0, 1]))

    return html