Example #1
def embedd_generic_tokenized_sentences(strings: List[str], factory: AbstractPatternFactory) -> \
        List[LegalDocument]:
  # Tokenize each string, wrap it in a LegalDocument, embed the whole batch
  # at once, then trim padding and compute per-pattern distances.
  if not strings:
    return []

  embedded_docs = []
  tokenized_sentences_list = []
  for s in strings:
    words = nltk.word_tokenize(s)

    subdoc = LegalDocument()
    subdoc.tokens = words
    subdoc.tokens_cc = words

    tokenized_sentences_list.append(subdoc.tokens)
    embedded_docs.append(subdoc)

  # batch-embed all tokenized sentences at once; returns padded arrays plus
  # the true (unpadded) length of each sentence
  sentences_emb, wrds, lens = embedd_tokenized_sentences_list(factory.embedder, tokenized_sentences_list)

  for i in range(len(embedded_docs)):
    length = lens[i]  # number of real (non-padding) tokens in sentence i
    tokens = wrds[i][:length]
    line_emb = sentences_emb[i][:length]

    embedded_docs[i].tokens = tokens
    embedded_docs[i].tokens_cc = tokens
    embedded_docs[i].embeddings = line_emb
    embedded_docs[i].calculate_distances_per_pattern(factory)

  return embedded_docs
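
A minimal usage sketch for the function above. `make_pattern_factory` and the sample strings are hypothetical stand-ins; any concrete AbstractPatternFactory with a loaded `embedder` would do:

# Hypothetical usage of embedd_generic_tokenized_sentences.
factory = make_pattern_factory()  # assumed: builds a factory with .embedder
docs = embedd_generic_tokenized_sentences(['Предмет договора', 'Цена договора'], factory)
for d in docs:
  print(len(d.tokens), len(d.embeddings))  # tokens and per-token embeddings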
Example #2
  def _detect_org_type_and_name(self, section: LegalDocument):

    factory = self.pattern_factory
    vectors = section.distances_per_pattern_dict  # shortcut

    # refresh distances for the organization-related pattern families
    section.calculate_distances_per_pattern(factory, pattern_prefix='org_', merge=True)
    section.calculate_distances_per_pattern(factory, pattern_prefix='ner_org', merge=True)
    section.calculate_distances_per_pattern(factory, pattern_prefix='nerneg_', merge=True)

    vectors['s_attention_vector_neg'] = factory._build_org_type_attention_vector(section)

    org_by_type = {}
    best_org_type = None
    _max = 0
    for org_type in org_types.keys():
      # mask the org-type distance vector with the negation-aware attention
      vector = vectors[org_type] * vectors['s_attention_vector_neg']
      if self.verbosity_level > 2:
        print('_detect_org_type_and_name, org_type=', org_type, vectors[org_type][0:10])

      # peak position comes from the masked vector, the score from the raw one
      idx = np.argmax(vector)
      val = vectors[org_type][idx]
      if val > _max:
        _max = val
        best_org_type = org_type

      org_by_type[org_type] = [idx, val]

    if self.verbosity_level > 2:
      print('_detect_org_type_and_name', org_by_type)

    return org_by_type, best_org_type
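
A sketch of consuming the return value; per the loop above, `org_by_type` maps each org type to `[token_index, score]` and `best_org_type` carries the highest raw score. The call site below is hypothetical:

# Hypothetical call site inside the same analysis class.
org_by_type, best_org_type = self._detect_org_type_and_name(section)
if best_org_type is not None:
  idx, score = org_by_type[best_org_type]
  print(f'org type: {best_org_type}, token index: {idx}, score: {score:.3f}')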
Example #3
    def map_subject_to_type(self,
                            section: LegalDocument,
                            denominator: float = 1) -> List[ProbableValue]:
        """
    :param section:
    :param denominator: confidence multiplyer
    :return:
    """
        section.calculate_distances_per_pattern(
            self.pattern_factory,
            merge=True,
            pattern_prefix='x_ContractSubject')
        all_subjects_vectors = filter_values_by_key_prefix(
            section.distances_per_pattern_dict, 'x_ContractSubject')
        all_mean = rectifyed_sum(all_subjects_vectors)

        subjects_mapping = []
        for subject_kind in contract_subjects:
            x = self.make_subject_attention_vector_3(section, subject_kind,
                                                     all_mean)
            # confidence, sum_, nonzeros_count, _max = estimate_confidence(x)
            confidence = self.estimate_confidence_2(x)
            confidence *= denominator
            pv = ProbableValue(subject_kind, confidence)
            subjects_mapping.append(pv)

        return subjects_mapping
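
A sketch of ranking the result. The attribute names `.value` and `.confidence` are assumptions inferred from the ProbableValue constructor above, and `detector` is a hypothetical instance of the class owning this method:

# Hypothetical: pick the most confident subject kind.
subjects = detector.map_subject_to_type(section)
best = max(subjects, key=lambda pv: pv.confidence)
print('most probable subject:', best.value, best.confidence)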
Example #4
  def find_sections(self, doc: LegalDocument, factory: AbstractPatternFactory, headlines: List[str],
                    headline_patterns_prefix: str = 'headline.', additional_attention: List[float] = None) -> dict:
    embedded_headlines = doc.embedd_headlines(factory)

    doc.sections = doc.find_sections_by_headlines_2(
      self.ctx, headlines, embedded_headlines, headline_patterns_prefix, self.ctx.config.headline_attention_threshold)

    self.ctx._logstep("embedding headlines into semantic space")

    return doc.sections
Example #5
def _try_to_fetch_value_from_section_2(
        value_section: LegalDocument,
        factory: ContractPatternFactory) -> List[ProbableValue]:
    value_section.calculate_distances_per_pattern(factory)

    vectors = factory.make_contract_value_attention_vectors(value_section)

    value_section.distances_per_pattern_dict = {
        **value_section.distances_per_pattern_dict,
        **vectors
    }

    values: List[ProbableValue] = extract_all_contraints_from_sr_2(
        value_section,
        value_section.distances_per_pattern_dict['value_attention_vector_tuned'])

    return values
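
A hedged sketch of a call site. `value_section` and `factory` are assumed to come from earlier steps (a section finder and a ContractPatternFactory); the section must already carry embeddings, since calculate_distances_per_pattern compares them against the factory patterns:

# Hypothetical usage; attribute names on ProbableValue assumed as above.
values = _try_to_fetch_value_from_section_2(value_section, factory)
for pv in values:
  print(pv.value, pv.confidence)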
Example #6
def _try_to_fetch_value_from_section___(
        value_section: LegalDocument,
        factory: ContractPatternFactory) -> List[ProbableValue]:
    # value_section.embedd(factory)
    value_section.calculate_distances_per_pattern(factory)

    # context._logstep(f'embedding for transaction values in section  "{ section_name }"')

    vectors = factory.make_contract_value_attention_vectors(value_section)

    value_section.distances_per_pattern_dict = {
        **value_section.distances_per_pattern_dict,
        **vectors
    }

    values: List[ProbableValue] = extract_all_contraints_from_sentence(
        value_section,
        value_section.distances_per_pattern_dict['value_attention_vector_tuned'])

    return values
Example #7
 def __init__(self, original_text: str):
     LegalDocument.__init__(self, original_text)
     self.subject = ('unknown', 1.0)
     self.contract_values: List[ProbableValue] = []
Example #8
  def find_sections(self, doc: LegalDocument, factory: AbstractPatternFactory, headlines: List[str],
                    headline_patterns_prefix: str = 'headline.', additional_attention: List[float] = None) -> dict:

    """
    Fuzziy Finds sections in the doc
    TODO: try it on Contracts and Protocols as well
    TODO: if well, move from here

    🍄 🍄 🍄 🍄 🍄 Keep in in the dark and feed it sh**



    """

    def is_hl_more_confident(a: HeadlineMeta, b: HeadlineMeta):
      return a.confidence > b.confidence

    headlines_attention_vector = self.normalize_headline_attention_vector(self.make_headline_attention_vector(doc))

    section_by_index = {}
    for section_type in headlines:
      # like ['name.', 'head.all.', 'head.gen.', 'head.directors.']:
      pattern_prefix = f'{headline_patterns_prefix}{section_type}'
      doc.calculate_distances_per_pattern(factory, pattern_prefix=pattern_prefix, merge=True)

      # warning! these are the boundaries of the headline, not of the entire section
      bounds, confidence, attention = self._find_charter_section_start(doc, pattern_prefix, headlines_attention_vector,
                                                                       additional_attention)

      if confidence > 0.5:
        sl = slice(bounds[0], bounds[1])
        hl_info = HeadlineMeta(None, section_type, confidence, doc.subdoc_slice(sl, name=section_type))
        hl_info.attention = attention
        put_if_better(section_by_index, key=sl.start, x=hl_info, is_better=is_hl_more_confident)

    # end-for
    # s = slice(bounds[0], bounds[1])
    # now slicing the doc
    sorted_starts = sorted(section_by_index.keys())
    # // sorted_starts.append(len(doc.tokens))

    section_by_type = {}

    for i in range(len(sorted_starts)):
      index = sorted_starts[i]
      section: HeadlineMeta = section_by_index[index]
      start = index  # todo: probably take the end of the caption
      end = doc.structure.next_headline_after(start)

      # end_alt = sorted_starts[i + 1]
      #
      section_len = end - start
      # if section_len > 5000:
      #   self.ctx.warning(
      #     f'Section "{section.subdoc.untokenize_cc()[:100]}" is probably way too large {section_len}, trimming to 5000 ')
      #   section_len = 5000  #

      sli = slice(start, start + section_len)
      section.body = doc.subdoc_slice(sli, name=section.type)
      section.attention = section.attention[sli]
      section_by_type[section.type] = section

    # end-for
    doc.sections = section_by_type

    self.ctx._logstep("Splitting Document into sections ✂️ 📃 -> 📄📄📄")
    return section_by_type
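
A sketch of driving the section finder. `finder` and `doc` are hypothetical (an instance owning find_sections and an embedded LegalDocument); the headline types are borrowed from the inline comment in the loop above:

# Hypothetical driver for find_sections.
sections = finder.find_sections(doc, factory, ['name.', 'head.all.', 'head.gen.', 'head.directors.'])
for section_type, hl in sections.items():
  print(section_type, hl.confidence, len(hl.body.tokens))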
Example #9
 def __init__(self, original_text):
   LegalDocument.__init__(self, original_text)
Example #10
 def __init__(self, original_text):
     LegalDocument.__init__(self, original_text)
     self.subjects: List[ProbableValue] = [
         ProbableValue(ContractSubject.Other, 0.0)
     ]
     self.contract_values: List[ProbableValue] = []
Example #11
    def test_tokenize_doc_custom_padding(self):
        doc = LegalDocument()

        tokens = doc.tokenize('aa bb cc')
        print(tokens)
        self.assertEqual(3, len(tokens))