def tagging(self, filename, outputfile):
    """Read ``filename`` and hand its documents to ``self.file_tagging``.

    A path that does not end in ".txt" is first passed to
    ``docutonelocate.convert_file`` and the path it returns is read instead.

    Args:
        filename: path of the file to tag.
        outputfile: value stored on the instance as ``self.outputfile``.
    """
    already_text = filename.endswith(".txt")
    source_path = filename if already_text else docutonelocate.convert_file(filename)

    # The output path is stored only after the input has been read.
    loaded = self._read_file(source_path)
    self.outputfile = outputfile
    self.file_tagging(loaded)
def file_clean(self, filename):
    """Print each sentence of the fused document, items joined by one space.

    A path that does not end in ".txt" is first passed to
    ``docutonelocate.convert_file``. The resulting path is given to
    ``LawDocument.get_fusion_document`` and every sentence it yields is
    printed on its own line.
    """
    from docutone.core.document import LawDocument

    needs_conversion = not filename.endswith(".txt")
    text_path = docutonelocate.convert_file(filename) if needs_conversion else filename

    fused = LawDocument().get_fusion_document(text_path)
    for tokens in fused:
        print(' '.join(tokens))
def file_named_tag(self, filename):
    """Run named-entity tagging over every sentence of the fused document.

    A path that does not end in ".txt" is first passed to
    ``docutonelocate.convert_file``. ``self.new_ner`` is reset to an empty
    dict, ``self.get_sentence_named_tag`` is called once per sentence, and
    ``self.write_ner`` is called at the end.
    """
    from docutone.core.document import LawDocument

    needs_conversion = not filename.endswith(".txt")
    text_path = docutonelocate.convert_file(filename) if needs_conversion else filename

    fused = LawDocument().get_fusion_document(text_path)

    # Start from a clean table only once the document has been loaded.
    self.new_ner = {}
    for tokens in fused:
        self.get_sentence_named_tag(tokens)
    self.write_ner()
def create_test_tagging(self, filename, ftype):
    """Set up category state for ``filename`` and run the tagging test on it.

    ``ftype`` is interpreted in one of two ways:

    * ``"<type>;<category>"`` -- split on the first ``;``; the left part is
      stored as ``self.ftype`` and the right part as ``self.categorie``
      (both stripped).
    * anything else -- ``self.ftype`` is ``None`` and ``self.categorie`` is
      the text before the first opening parenthesis, provided that
      parenthesis sits beyond index 3; otherwise the whole stripped string.
      Both the ASCII ``(`` and the full-width U+FF08 form are recognised,
      with the ASCII one taking precedence.

    Args:
        filename: path of the document; converted with
            ``docutonelocate.convert_file`` unless it ends in ".txt".
        ftype: document type description, see above.

    Returns:
        Whatever ``self.tagging.tagging_test`` returns for the documents read.
    """
    self.fullname = filename
    self.filename = os.path.basename(filename).split('.')[0]

    if ";" in ftype:
        parent, cat = ftype.split(";", 1)
        self.categorie = cat.strip()
        self.ftype = parent.strip()
    else:
        # Look for the ASCII parenthesis first, then fall back to
        # U+FF08 FULLWIDTH LEFT PARENTHESIS. The original second branch
        # repeated the ASCII test and therefore could never run.
        index = ftype.find('(')
        if index < 0:
            index = ftype.find('\uff08')

        # A parenthesis in the first few characters is treated as part of
        # the name rather than as the start of a suffix.
        if index > 3:
            self.categorie = ftype[0:index].strip()
        else:
            self.categorie = ftype.strip()
        self.ftype = None

    # If this is not a text file, convert it to one first.
    if filename.endswith(".txt"):
        inputfilename = filename
    else:
        inputfilename = docutonelocate.convert_file(filename)

    # Pick the reader that matches the category; anything unrecognised
    # goes through the generic reader.
    self.dtn_doc = None
    if self.categorie == dtn_document.LABOR_CONTRACT:
        self.dtn_doc = labor_contract.LaborContract()
    elif self.categorie == dtn_document.LOAN_AGREEMENT:
        self.dtn_doc = loan_agreement.LoanAgreement()
    elif self.categorie == dtn_document.TRANSFER_AGREEMENT:
        self.dtn_doc = transfer_agreement.TransferAgreement()
    else:
        self.dtn_doc = other_document.OtherDocument()

    documents = self.dtn_doc.read(inputfilename)
    self._add_new_clauses(documents, self.dtn_doc._results)
    return self.tagging.tagging_test(documents)
def _load_document_clauses(self, filename, label):
    """Load the clauses of ``filename`` and record one sample per section.

    For every section produced by ``self._clause.create_clauses`` the
    section title is looked up in ``self.labels_index``; an unseen title is
    given the next free integer id and appended to ``self.label_name``.
    The id is appended to ``self.labels`` and ``section.toWords()`` to
    ``self.texts``, so the two lists stay aligned.

    Args:
        filename: path of the document; converted with
            ``docutonelocate.convert_file`` unless it ends in ".txt".
        label: accepted for interface compatibility. Labels are keyed by
            the bare section title, so this value is not used.
    """
    if filename.endswith(".txt"):
        ofile = filename
    else:
        ofile = docutonelocate.convert_file(filename)
    self._clause.create_clauses(ofile)

    for section in self._clause.sections:
        # The key is the section title alone. A "label:title" value used to
        # be computed here and immediately discarded; that dead store is gone.
        name = section.title
        if name not in self.labels_index:
            self.labels_index[name] = len(self.labels_index)
            self.label_name.append(name)
        self.labels.append(self.labels_index[name])
        self.texts.append(section.toWords())
def load_predict_document(self, filename):
    """Read ``filename`` and return one word list per non-empty section.

    When the document has sections, each section with at least one sentence
    contributes ``doc.sentencesTowords`` applied to the first element of
    every sentence entry. When it has no sections, every entry of the
    document header is converted on its own instead.

    Args:
        filename: path of the document; converted with
            ``docutonelocate.convert_file`` unless it ends in ".txt".

    Returns:
        list of the values returned by ``doc.sentencesTowords``.
    """
    if filename.endswith(".txt"):
        ofile = filename
    else:
        ofile = docutonelocate.convert_file(filename)
    self._document.read_section(ofile)

    sections = self._document.sections
    if not sections:
        # No section structure: fall back to the header, one entry at a time.
        return [doc.sentencesTowords([s]) for s in self._document.document_header]

    texts = []
    for section in sections:
        # Each sentence entry is indexable; only its first element is used.
        first_items = [p[0] for p in section.sentences]
        if first_items:
            texts.append(doc.sentencesTowords(first_items))
    return texts
def get_terms(self, filename, filetype):
    """Read the sections of ``filename`` and record verified term values.

    The file is converted with ``docutonelocate.convert_file`` unless its
    name ends in ".txt", then loaded through ``self.document.read_section``.
    The document name and date reported by ``self.document`` are stored on
    the instance and, when non-empty, recorded under the first matching key
    found in ``self.keywords``.

    Each section whose title maps to a term name (through
    ``dtn_sentence.get_keywords_by_name``) has its sentences recorded under
    that term, followed by the sentences of the directly following sections
    whose ``level`` is greater than its own.

    Args:
        filename: path of the document to read.
        filetype: not used by the active code; it appears only in a
            commented-out call.

    Returns:
        list holding the ``sentences`` value of every section visited by the
        outer loop that has at least one sentence. Sections consumed by the
        inner loop are skipped by the outer loop and are not included.
    """
    if (filename.endswith(".txt")):
        ofile = filename
    else:
        ofile = docutonelocate.convert_file(filename)
    #lawdocument.create_document(ofile, filetype)
    self.document.read_section(ofile)
    self._title = self.document.document_name
    self._contract_date = self.document.document_date
    # Record the title under the first key present:
    # '文件名称' (document name), then '合同名称' (contract name).
    if self._title:
        if '文件名称' in self.keywords:
            self.verified_terms['文件名称'].add_value(self._title, 1)
        elif '合同名称' in self.keywords:
            self.verified_terms['合同名称'].add_value(self._title, 1)
    # Record the date under the first key present: '签约日期' (signing date),
    # '签发日期' (issue date), then '合同日期' (contract date).
    if self._contract_date:
        if '签约日期' in self.keywords:
            self.verified_terms['签约日期'].add_value(self._contract_date, 1)
        elif '签发日期' in self.keywords:
            self.verified_terms['签发日期'].add_value(self._contract_date, 1)
        elif '合同日期' in self.keywords:
            self.verified_terms['合同日期'].add_value(self._contract_date, 1)
    terms = []
    # The string below is disabled code kept as a bare string statement; it
    # has no effect at runtime.
    ''' prev_sentence = '' for s in ld.document_header : prev_sentence += s if ld._is_sentence_end(s) : terms.append([prev_sentence]) prev_sentence = '' if prev_sentence : terms.append([prev_sentence]) '''
    nb = len(self.document.sections)
    if nb > 0:
        # Manual index so the inner loop can consume following sections and
        # the outer loop resumes after them.
        index = 0
        while index < nb:
            p = self.document.sections[index]
            index += 1
            ''' if section title = term name add it to verfied table '''
            if p.title:
                termname = dtn_sentence.get_keywords_by_name(
                    p.title, self.keywords)
                if termname:
                    if len(p.sentences) > 0:
                        for s in p.sentences:
                            # A sentence is either a plain string or an
                            # indexable entry whose items 0 and 1 are used.
                            if isinstance(s, str):
                                self.verified_terms[termname].add_value(
                                    s, 1)
                            else:
                                s_line = s[0]
                                self._add_verified_sentences(
                                    termname, s[1], s_line[-1], 1)
                    # NOTE(review): this loop is placed at the `if termname`
                    # level, so deeper sections are absorbed even when the
                    # titled section has no sentences of its own -- confirm
                    # against the intended nesting.
                    while index < nb:
                        sp = self.document.sections[index]
                        index += 1
                        if sp.level > p.level:
                            for s in sp.sentences:
                                if isinstance(s, str):
                                    self.verified_terms[
                                        termname].add_value(s, 1)
                                else:
                                    s_line = s[0]
                                    self._add_verified_sentences(
                                        termname, s[1], s_line[-1], 1)
                        else:
                            ''' back to prev section '''
                            # Not a deeper section: undo the increment so the
                            # outer loop processes it next.
                            index -= 1
                            break
            # NOTE(review): placed at the outer-loop level, so every visited
            # section with sentences is returned whether or not its title
            # matched a term -- confirm against the intended nesting.
            if len(p.sentences) > 0:
                terms.append(p.sentences)
    return terms
commd += " -i " + infile if o_file: commd += " -o " + o_file if options.doc_type: commd += " -t " + options.doc_type dtn_logger.logger_info("MAIN", commd) if options.action == 'convert': ''' conv = Convert(verbose=verbose, restart=options.restart) o_file = conv.open_output(infile, o_file) conv.files_to_text(infile, o_file) conv.close_output() ''' ofile = docutonelocate.convert_file(infile, True) elif options.action == 'testfile': conv = Convert(verbose=verbose, restart=options.restart) conv.test_files_in_directory(infile, o_file) elif options.action == 'changebad': conv = Convert(verbose=verbose, restart=options.restart) conv.change_root_bad_files(infile, o_file) elif options.action == 'text4sentences':