Python LawDocument.read_section示例

class TermsVerification(object):

    SIMU_SEUIL = 0.6

    def __init__(self):
        """Create a verifier with no document and no model loaded yet.

        Every attribute that other methods of this class read is created
        here, so calling e.g. ``to_json`` or ``verify_term`` before
        ``verify_document`` fails on a ``None`` value instead of raising
        ``AttributeError`` for a missing attribute.
        """
        self.contract = Contract(0)

        # Per-document state; (re)filled by _init_terms_table() and get_terms().
        self.verified_terms = {}
        self._filetime = None
        self.fullname = None
        self.filename = None
        self.categorie = None
        self._title = None
        self._contract_date = None
        self.keywords = []

        # Model state; filled by _load_terms_model().
        self.term_names = None
        self.term_set = None
        self.term_list = None
        self.model = None

        self.segment = Segmentation()
        self.document = LawDocument()

    def _init_terms_table(self, filename, termtype):
        """Reset the per-document state for a new verification run.

        Stores the document path, its short name (the base name up to the
        first '.'), its creation time, and builds one ``ExtractData`` slot
        per keyword returned for ``termtype``.

        :param filename: path of the document to verify
        :param termtype: document category, also stored as ``self.categorie``
        """
        self.categorie = termtype

        # File identity: full path, short name and creation date.
        self.fullname = filename
        base_name = os.path.basename(filename)
        self.filename = base_name.split('.')[0]
        self._filetime = util.get_creation_file_date(filename)

        # Verification table: cleared first so a failure while fetching the
        # keywords leaves it empty rather than stale.
        self.verified_terms = {}
        self.keywords = dtn_sentence.get_document_categorie(termtype)
        self.verified_terms = {
            keyword: ExtractData(keyword, termtype)
            for keyword in self.keywords
        }

        dtn_logger.logger_info("VERIFY", "%s (%s)" % (filename, termtype))

    def _load_terms_model(self, doctype=None):
        """Load the term labels, term tables and the Doc2Vec model.

        :param doctype: value assigned to ``self.contract.doc_path`` before
            the term data and model file name are requested from the contract
        """
        contract = self.contract
        contract.doc_path = doctype

        self.term_names = contract.load_term_label()
        self.term_set = contract.load_term_set()
        self.term_list = contract.load_term_list()

        model_path = contract.get_term_model_name()
        self.model = doc2vec.Doc2Vec.load(model_path)

    def similar_term(self, term_words, termtype):

        tname = None
        ttype = None
        simu = 0.0
        docvec = self.model.infer_vector(doc_words=term_words)
        sims = self.model.docvecs.most_similar(positive=[docvec], topn=5)

        for i in range(len(sims)):
            n_term = int(sims[i][0])
            f_simu = sims[i][1]
            if f_simu > self.SIMU_SEUIL:
                if (n_term >= len(self.term_list)):
                    continue
                '''term = self.term_names[self.term_set[n_term]-1]'''
                term_name = self.term_list[n_term]
                if ':' in term_name:
                    tab = term_name.split(':', 1)
                    if tab[1] == termtype:
                        if tname == None:
                            tname = tab[0]
                            ttype = tab[1]
                            simu = f_simu
                            break

                elif term_name == termtype:
                    tname = term_name
                    ttype = term_name
                    simu = f_simu
                    break

            else:
                break

        return tname, ttype, simu

    def verify_term(self, text):

        term_words = self.contract.get_term_words(text)

        return self.similar_term(term_words)

    def _add_verified_sentences(self, termname, n_start, end_char, simu):

        nl = n_start
        st = self.document.norm_sentences[nl]
        ps = self.document.parser_sentence(st)
        if ps:
            st = ps[1]
            if ps[1][-1] is not ' ' and ps[2][0] is not ' ':
                st += ' '
            st += ps[2]

        self.verified_terms[termname].add_value(st, simu)
        while len(st) == 0 or st[-1] != end_char:
            nl += 1
            st = self.document.norm_sentences[nl]
            self.verified_terms[termname].add_value(st, 1)

    ''' get document term '''

    def get_terms(self, filename, filetype):

        if (filename.endswith(".txt")):
            ofile = filename
        else:
            ofile = docutonelocate.convert_file(filename)

        #lawdocument.create_document(ofile, filetype)
        self.document.read_section(ofile)

        self._title = self.document.document_name
        self._contract_date = self.document.document_date
        if self._title:
            if '文件名称' in self.keywords:
                self.verified_terms['文件名称'].add_value(self._title, 1)
            elif '合同名称' in self.keywords:
                self.verified_terms['合同名称'].add_value(self._title, 1)
        if self._contract_date:
            if '签约日期' in self.keywords:
                self.verified_terms['签约日期'].add_value(self._contract_date, 1)
            elif '签发日期' in self.keywords:
                self.verified_terms['签发日期'].add_value(self._contract_date, 1)
            elif '合同日期' in self.keywords:
                self.verified_terms['合同日期'].add_value(self._contract_date, 1)

        terms = []
        '''
        prev_sentence = ''
        for s in ld.document_header :
            prev_sentence += s
            if ld._is_sentence_end(s) :
                terms.append([prev_sentence])
                prev_sentence = ''
        if prev_sentence :
            terms.append([prev_sentence])
        '''
        nb = len(self.document.sections)
        if nb > 0:
            index = 0
            while index < nb:
                p = self.document.sections[index]
                index += 1
                ''' if section title = term name add it to verfied table '''
                if p.title:
                    termname = dtn_sentence.get_keywords_by_name(
                        p.title, self.keywords)
                    if termname:
                        if len(p.sentences) > 0:
                            for s in p.sentences:
                                if isinstance(s, str):
                                    self.verified_terms[termname].add_value(
                                        s, 1)
                                else:
                                    s_line = s[0]
                                    self._add_verified_sentences(
                                        termname, s[1], s_line[-1], 1)

                        while index < nb:
                            sp = self.document.sections[index]
                            index += 1
                            if sp.level > p.level:
                                for s in sp.sentences:
                                    if isinstance(s, str):
                                        self.verified_terms[
                                            termname].add_value(s, 1)
                                    else:
                                        s_line = s[0]
                                        self._add_verified_sentences(
                                            termname, s[1], s_line[-1], 1)
                            else:
                                ''' back to prev section '''
                                index -= 1
                                break

                if len(p.sentences) > 0:
                    terms.append(p.sentences)

        return terms

    def _verified_clauses(self, filename, termtype):

        terms = self.get_terms(filename, termtype)
        for term in terms:
            sentences = [s[0] for s in term]

            n_start = term[0][1]
            end_char = sentences[-1][-1]

            term_words = self.contract.get_term_words(sentences)

            tname, ttype, simu = self.similar_term(term_words, termtype)
            if ttype != None and tname != None:
                if ttype == termtype:
                    if tname in self.verified_terms.keys():
                        '''
                        for s in sentences :
                            self.verified_terms[tname].add_value(s, simu)
                        '''
                        self._add_verified_sentences(tname, n_start, end_char,
                                                     simu)

    def create_contract_model(self, fpath):
        """Delegate to ``self.contract.create_terms`` with ``fpath``.

        :param fpath: path passed through unchanged to the contract object
        """

        self.contract.create_terms(fpath)

    def get_contract_date(self):

        return time.strftime("%Y-%m-%d   %H:%M:%S",
                             time.gmtime(self._filetime))

    def verify_document(self, filename, doctype, termtype):

        # init clause table
        self._init_terms_table(filename, termtype)

        # load lagal terms training model
        self._load_terms_model(doctype)

        self._verified_clauses(filename, termtype)

        sorted_list = []

        for key in self.keywords:
            if key in self.verified_terms.keys():
                term = self.verified_terms[key].term_value
                if (len(term) > 0):
                    sorted_list.append((key, 1, term))
                else:
                    sorted_list.append((key, 0))

        return sorted_list

    def _to_html_text(self, term_list):

        lists = []
        for elem in term_list:
            if len(elem) == 3:
                name, _, data = elem
            else:
                continue

            text = ""
            if len(data) > 0:
                for v, s_simu in data:
                    if s_simu > 0:  # is term name and find term string
                        s = dtn_sentence.get_sentence(v)
                        ss = dtn_document.law_document.parser_sentence(s)
                        text += '<p>'
                        if ss:
                            text += '<b>' + ss[1] + ' ' + ss[2] + '</b></p>'
                            text += '<p>'  # empty line
                        else:
                            text += s
                        text += '</p>'

                lists.append([name, text])
            else:
                lists.append([name, text])

        return lists

    def to_json(self, term_list):
        """Print the verification result through ``docutonejson.print_json``.

        The payload has two keys: ``"filename"`` (short name, full path,
        category) and ``"result"`` (the HTML rendering of ``term_list``).

        :param term_list: entries as produced by ``verify_document``
        """
        payload = {
            "filename": [self.filename, self.fullname, self.categorie],
            "result": self._to_html_text(term_list),
        }

        docutonejson.print_json(payload)

    def _run_example(self, relative_path, ftype):
        """Verify one sample file under ``config.TEST_PATH`` and print it."""
        fname = config.TEST_PATH + relative_path
        term_list = self.verify_document(fname, None, ftype)
        self.to_json(term_list)

    def example0(self):
        """Verify the sample labour contract (converted .docx)."""
        self._run_example("/劳动合同/Chanel劳动合同.docx.txt", "劳动合同")

    def example1(self):
        """Verify the sample articles of association (converted .pdf)."""
        self._run_example("/章程/华能国际电力股份有限公司章程.pdf.txt", "有限责任公司章程")

    def example2(self):
        """Verify the sample articles of association (converted .docx)."""
        self._run_example("/章程/华能国际电力股份有限公司章程.docx.txt", "有限责任公司章程")

示例#2

显示文件

文件： clause_verifying.py 项目： minlogiciel/docutone

class ClauseVerifying(object):
    def __init__(self, modelname=None):
        """Create the document reader and load the clause model labels.

        :param modelname: accepted but not used by this constructor
        """
        self._document = LawDocument()

        trainer = clause_training.ClauseTraining()
        self.clause_model = trainer
        trainer.load_model_label()

    def load_predict_document(self, filename):
        if (filename.endswith(".txt")):
            ofile = filename
        else:
            ofile = docutonelocate.convert_file(filename)
        self._document.read_section(ofile)

        texts = []
        if len(self._document.sections) > 0:
            for section in self._document.sections:
                ss = []
                if section.title:
                    pass
                if len(section.sentences) > 0:
                    ss = [p[0] for p in section.sentences]
                    if len(ss) > 0:
                        texts.append(doc.sentencesTowords(ss))

        else:
            for s in self._document.document_header:
                texts.append(doc.sentencesTowords([s]))

        return texts

    def predict(self, filename):
        """Print sampled clause labels for the first entries of a document.

        The document is tokenised, padded to the model's sequence length and
        fed entry by entry to the stored model. For each entry three labels
        are sampled (temperatures 1.0, 0.8 and 0.2) and printed, followed by
        the entry's text. The loop stops after the entry with index 6.

        :param filename: path of the document to classify
        """
        texts = self.load_predict_document(filename)

        # NOTE(review): the tokenizer is fitted on the document being
        # predicted, not taken from training - confirm the resulting word
        # indices are what the stored model expects.
        tokenizer = Tokenizer(num_words=self.clause_model.MAX_NB_WORDS)
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)

        data = pad_sequences(sequences,
                             maxlen=self.clause_model.MAX_SEQUENCE_LENGTH)

        dtn_logger.logger_info("PREDICT",
                               "Verification document : " + filename)
        dtn_logger.logger_info("PREDICT", "Predict Data : " + str(data.shape))

        model = md.load_json_model(self.clause_model.MODEL_NAME)
        model.compile(loss='binary_crossentropy',
                      optimizer=md.OPTIMIZER_ADAM,
                      metrics=['accuracy'])

        for position, _ in enumerate(data):
            # Index with a one-element array to keep the batch dimension.
            batch = data[np.array([position])]
            preds = model.predict(batch)

            for temperature in (1.0, 0.8, 0.2):
                label_index = self.sample(preds[0], temperature)
                print("*** " + self.clause_model.label_name[label_index] +
                      "***")

            print(texts[position])
            if position > 5:
                break

    def sample(self, p, temperature=1.0):
        # helper function to sample an index from a probability array
        preds = np.asarray(p).astype('float64')
        preds = np.asarray(preds).astype('float64')
        preds = np.log(preds) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        probas = np.random.multinomial(1, preds, 1)

        mmm = np.argmax(probas)
        print(mmm)
        return mmm