Пример #1
0
    def test_doc_parser(self):
        """Smoke-test rebuilding a LegalDocument from WordDocParser output.

        Each paragraph header and body returned by the parser is turned into
        its own LegalDocument, appended to one accumulating document, and
        registered as a Paragraph whose tags span the appended token ranges.
        The test makes no assertions; it only checks that nothing raises.
        It is skipped (returns early) when no MongoDB connection is available.
        """
        if get_mongodb_connection() is None:  # TODO: this is a weird way of detecting we're on CI
            return

        docx_path = "/Users/artem/work/nemo/goil/IN/Другие договоры/Договор Формула.docx"
        parsed = WordDocParser().read_doc(docx_path)

        doc: LegalDocument = LegalDocument('')
        doc.parse()

        # Index in doc.tokens_map where the next appended piece will start.
        cursor = 0
        for parsed_doc in parsed['documents']:
            for paragraph in parsed_doc['paragraphs']:
                header_text = paragraph['paragraphHeader']['text'] + '\n'
                body_text = paragraph['paragraphBody']['text'] + '\n'

                # Append the header and remember which tokens it occupies.
                header_doc = LegalDocument(header_text)
                header_doc.parse()
                doc += header_doc
                header_span = (cursor, len(doc.tokens_map))
                print(header_span)
                cursor = len(doc.tokens_map)

                # Append the body right after the header.
                body_doc = LegalDocument(body_text)
                body_doc.parse()
                doc += body_doc
                body_span = (cursor, len(doc.tokens_map))

                header_tag = SemanticTag('headline', header_text, header_span)
                body_tag = SemanticTag('paragraphBody', None, body_span)
                print(header_tag)

                paragraph_entry = Paragraph(header_tag, body_tag)
                doc.paragraphs.append(paragraph_entry)
                cursor = len(doc.tokens_map)

                # Slicing is exercised only to make sure it does not raise;
                # NOTE: the text-equality assertions on these slices are
                # currently disabled.
                doc.subdoc_slice(paragraph_entry.header.as_slice())
                doc.subdoc_slice(paragraph_entry.body.as_slice())

        separator = '-' * 100
        print(separator)
        print(doc.text)

        # Slice out every registered header once more (results are unused).
        for registered in doc.paragraphs:
            doc.subdoc_slice(registered.header.as_slice())
        print(separator)
Пример #2
0
def nn_get_subject(textmap: TextMap, semantic_map: DataFrame,
                   subj_1hot) -> SemanticTag:
    """Build the 'subject' tag from a decoded subject prediction.

    :param textmap: token map forwarded to ``nn_get_tag_value``
    :param semantic_map: frame forwarded to ``nn_get_tag_value``
    :param subj_1hot: prediction passed to ``decode_subj_prediction``
    :return: a 'subject' tag whose value is the predicted subject's ``name``
             and whose confidence is the decoded confidence; its span is
             copied from the 'subject' tag found by ``nn_get_tag_value``,
             or stays ``None`` when nothing is found
    """
    subject, subject_confidence, _ = decode_subj_prediction(subj_1hot)

    result = SemanticTag('subject', subject.name, span=None)
    result.confidence = subject_confidence

    # Only the location is borrowed from the span-level prediction.
    located = nn_get_tag_value('subject', textmap, semantic_map)
    if located is None:
        return result

    result.span = located.span
    return result
Пример #3
0
def nn_get_tag_value(tagname: str,
                     textmap: TextMap,
                     semantic_map: DataFrame,
                     threshold=0.3) -> SemanticTag or None:
    """Extract the single top span for ``tagname`` and wrap it in a tag.

    :param tagname: column of ``semantic_map`` to read; also the tag kind
    :param textmap: token map used to get the text of the found span
    :param semantic_map: frame holding one column of values per tag name
    :param threshold: threshold forwarded to ``find_top_spans``
    :return: a tag with the span's text as value and the mean of the column
             values over the span as confidence, or ``None`` when
             ``find_top_spans`` returns nothing
    """
    attention = semantic_map[tagname].values
    # TODO: estimate per-tag thresholds
    top_spans = find_top_spans(attention, threshold=threshold, limit=1)

    if len(top_spans) == 0:
        return None

    best = top_spans[0]
    token_span = best.start, best.stop
    found = SemanticTag(tagname, textmap.text_range(token_span), token_span)
    found.confidence = float(attention[best].mean())
    return found
Пример #4
0
def find_document_date(doc: LegalDocument,
                       tagname='date') -> SemanticTag or None:
    """Look for a date in the head of the document.

    :param doc: document whose head (as returned by ``get_doc_head``) is searched
    :param tagname: kind given to the resulting tag
    :return: a tag holding the found date and its token span in the head's
             tokens map, or ``None`` when ``find_date`` reports no span
    """
    head: LegalDocument = get_doc_head(doc)
    char_span, found_date = find_date(head.text)
    if c_span_missing := char_span is None:
        return None

    # Convert the character range into token indices.
    token_span = head.tokens_map.token_indices_by_char_range(char_span)
    return SemanticTag(tagname, found_date, token_span)
Пример #5
0
def find_document_number_in_subdoc(doc: LegalDocument,
                                   tagname='number',
                                   parent=None) -> [SemanticTag]:
    """Tag every valid document number matched in ``doc.text``.

    :param doc: (sub)document to scan with the ``document_number_c`` pattern
    :param tagname: kind given to each resulting tag
    :param parent: parent passed through to each created tag
    :return: one tag per match whose 'number' group passes
             ``is_number_valid``; each tag is shifted by ``doc.start``.
             Rejected numbers are printed and skipped.
    """
    tags = []
    for match in re.finditer(document_number_c, doc.text):
        number = match['number']
        if not is_number_valid(number):
            print('invalid', number)
            continue

        token_span = doc.tokens_map.token_indices_by_char_range(match.span())
        tag = SemanticTag(tagname, number, token_span, parent=parent)
        tag.offset(doc.start)
        tags.append(tag)

    return tags
Пример #6
0
    def test_contract_analyze(self):
        """Check the value and currency quotes found in the fixture contract.

        After attribute extraction, the text under the contract value tag
        must be '80000,00' and the text under the currency tag 'рублей'.
        """
        doc, factory, ctx = self._get_doc_factory_ctx()

        # hack for old pickles: make sure these attributes exist on the instance
        doc.__dict__['number'] = None
        doc.__dict__['date'] = None
        doc.__dict__['attributes_tree'] = ContractSchema()

        ctx.find_attributes(doc, AuditContext())
        tags: [SemanticTag] = doc.get_tags()

        expected_quotes = (
            (ContractTags.Value.display_string, '80000,00'),
            (ContractTags.Currency.display_string, 'рублей'),
        )
        for kind, expected in expected_quotes:
            found = SemanticTag.find_by_kind(tags, kind)
            quote = doc.tokens_map.text_range(found.span)
            self.assertEqual(expected, quote)
Пример #7
0
def find_document_number(doc: LegalDocument,
                         tagname='number') -> SemanticTag or None:
    """Look for a document number in the head of the document.

    :param doc: document whose head (as returned by ``get_doc_head``) is searched
    :param tagname: kind given to the resulting tag
    :return: a tag holding the number and its token span in the head's
             tokens map, or ``None`` when no number is found
    """
    head: LegalDocument = get_doc_head(doc)

    number, char_span = find_document_number_span(head.text)
    if number is None:
        return None

    # Convert the character range into token indices.
    token_span = head.tokens_map.token_indices_by_char_range(char_span)
    return SemanticTag(tagname, number, token_span)
Пример #8
0
def find_charter_org(charter: LegalDocument) -> [SemanticTag]:
    """Collect the organisation name and type tags from the charter's leading slice.

    Only the first ``HyperParameters.protocol_caption_max_size_words`` items
    of the charter are searched, and at most one organisation is requested.
    A warning is recorded on the charter for each of the two tags that is
    missing.

    TODO: see also find_protocol_org

    :param charter: charter document to search
    :return: the found 'org-1-name' and/or 'org-1-type' tags, in that order
    """
    caption = charter[0:HyperParameters.protocol_caption_max_size_words]
    candidates: [SemanticTag] = find_org_names(caption, max_names=1)
    found_tags = []

    name_tag = SemanticTag.find_by_kind(candidates, 'org-1-name')
    if name_tag is None:
        charter.warn(ParserWarnings.org_name_not_found)
    else:
        found_tags.append(name_tag)

    type_tag = SemanticTag.find_by_kind(candidates, 'org-1-type')
    if type_tag is None:
        charter.warn(ParserWarnings.org_type_not_found)
    else:
        found_tags.append(type_tag)

    return found_tags
Пример #9
0
    def asLegalDoc(self):
        """Build a legal document object from this stored record.

        When the record is not analyzed, the document is re-combined from
        the parser data with ``join_paragraphs``. When it is analyzed, the
        saved tokenization is restored instead (attributes are bound to an
        existing tokens map), and paragraphs are rebuilt from the saved
        headers if there are any.

        :return: the document, with ``user`` copied from this record
        """
        if not self.is_analyzed():
            # re-combine parser data
            doc = join_paragraphs(self.parse, self._id, filename=self.filename)
        else:
            # attributes are bound to an existing tokens map
            # -->  preserve saved tokenization
            doc = create_doc_by_type(self.parse['documentType'],
                                     self._id,
                                     filename=self.filename)

            doc.tokens_map_norm = self.get_tokens_for_embedding()
            doc.tokens_map = self.get_tokens_map_unchaged()
            if 'sentence_map' in doc.__dict__:
                doc.sentence_map = self.get_sentence_map()
                if doc.sentence_map is None:
                    doc.split_into_sentenses()

            headers = self.analysis.get('headers', None)
            if headers is not None:
                doc.paragraphs = []
                doc_end = len(doc.tokens_map)
                final_index = len(headers) - 1
                for index, saved in enumerate(headers):
                    header_tag = SemanticTag('headline', saved['value'],
                                             saved['span'])
                    # A body runs up to the next header, or to the end of
                    # the document for the last header.
                    if index < final_index:
                        body_end = headers[index + 1]['span'][0]
                    else:
                        body_end = doc_end
                    body_span = header_tag.span[1] + 1, body_end
                    body_tag = SemanticTag('paragraphBody', None, body_span)

                    doc.paragraphs.append(Paragraph(header_tag, body_tag))

        doc.user = self.user
        return doc
Пример #10
0
def find_org_names_raw_by_re(doc: LegalDocument, regex, confidence_base: float, parent=None,
                             decay_confidence=True) -> [ContractAgent]:
  """Create one ContractAgent per match of ``regex`` in ``doc.text``.

  For every group name listed in ``org_pieces`` whose matched character span
  is longer than 1, the span is converted to token indices and its stripped
  text becomes a tag on the agent, provided ``_is_valid`` accepts it. The
  'human_name' group is stored under the kind 'name'. Tags are shifted by
  ``doc.start``. Any IndexError raised while handling a piece is swallowed
  and that piece is skipped.

  :param doc: document to search
  :param regex: pattern passed to ``re.finditer``
  :param confidence_base: confidence assigned to each tag before decay
  :param parent: parent passed through to each created tag
  :param decay_confidence: when true, confidence is scaled down by the tag's
         relative start position, ``span[0] / len(doc)``
  :return: all agents, each passed through ``normalize_contract_agent``
  """
  agents: [ContractAgent] = []

  for match in re.finditer(regex, doc.text):
    agent = ContractAgent()
    agents.append(agent)
    for re_kind in org_pieces:  # like 'type', 'name', 'human_name', 'alt_name', 'alias' ...
      try:
        char_span = match.span(re_kind)
        if span_len(char_span) <= 1:
          continue

        token_span = doc.tokens_map.token_indices_by_char_range(char_span)
        if decay_confidence:
          confidence = confidence_base * (1.0 - (token_span[0] / len(doc)))
        else:
          confidence = confidence_base

        kind = 'name' if re_kind == 'human_name' else re_kind

        value = doc.tokens_map.text_range(token_span).strip()
        if not _is_valid(value):
          continue

        tag = SemanticTag(kind, value, token_span, parent=parent)
        tag.confidence = confidence
        tag.offset(doc.start)
        agent.__dict__[kind] = tag
      except IndexError:
        pass

  # normalize org_name names by find_closest_org_name
  for agent in agents:
    normalize_contract_agent(agent)

  return agents
Пример #11
0
 def as_tag(self):
     """Return a value-less tag of this object's type.

     The tag spans from ``self.subdoc.start`` to ``self.body.end`` and
     carries this object's confidence.
     """
     kind = self.type
     span = (self.subdoc.start, self.body.end)
     tag = SemanticTag(kind, None, span)
     tag.confidence = self.confidence
     return tag
Пример #12
0
 def tag_val(name):
     """Return the value of the tag of kind ``name`` in ``tags``, or None if absent."""
     found = SemanticTag.find_by_kind(tags, name)
     return None if found is None else found.value