def get_document_type(self, dictname):
    # Load the plain-text version of the dictionary entry and return its document type.
    textfname = "../dictionary/text/" + dictname + ".txt"
    law_document = LawDocument()
    law_document.analyze(filename=textfname)
    return law_document.document_type
def get_document(self, dictname):
    # Load the plain-text version of the dictionary entry and return its title lines as one string.
    textfname = "../dictionary/text/" + dictname + ".txt"
    law_document = LawDocument()
    law_document.analyze(filename=textfname)
    return "\n".join(law_document.document_title)
def file_clean(self, filename):
    # Convert non-text input to text if needed, then print the fused document sentence by sentence.
    from docutone.core.document import LawDocument
    if filename.endswith(".txt"):
        ofile = filename
    else:
        ofile = docutonelocate.convert_file(filename)
    lawdoc = LawDocument()
    document = lawdoc.get_fusion_document(ofile)
    for sentence in document:
        print(' '.join(sentence))
def file_named_tag(self, filename):
    # Convert non-text input to text if needed, tag named entities sentence by sentence,
    # then write the collected entities out.
    from docutone.core.document import LawDocument
    if filename.endswith(".txt"):
        ofile = filename
    else:
        ofile = docutonelocate.convert_file(filename)
    lawdoc = LawDocument()
    document = lawdoc.get_fusion_document(ofile)
    self.new_ner = {}
    for sentence in document:
        self.get_sentence_named_tag(sentence)
    self.write_ner()
def __init__(self, modelname=None):
    # Training hyperparameters
    self.MAX_SEQUENCE_LENGTH = 1000
    self.MAX_NB_WORDS = 20000
    self.EMBEDDING_DIM = 100
    self.VALIDATION_SPLIT = 0.25
    self.EPOCHS = 64
    self.BATCH_SIZE = 32
    self.POOL_SIZE = 5
    self.FILTERS = 64
    self.LSTM_OUTPUT_SIZE = 70
    if modelname is None:
        self.MODEL_NAME = "clause_model"
    else:
        self.MODEL_NAME = modelname
    self._document = LawDocument()
    self._clause = Clause()
    self.texts = []         # list of text samples
    self.labels_index = {}  # dictionary mapping label name to numeric id
    self.labels = []        # list of label ids
    self.label_name = []
    self._debug = 1
    self._save_model = False
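# A minimal sketch (not from the original project) of how these hyperparameters are
# typically consumed downstream: MAX_NB_WORDS caps the Tokenizer vocabulary and
# MAX_SEQUENCE_LENGTH fixes the padded input length. The sample texts below are
# made up; the real pipeline feeds pre-segmented documents produced by LawDocument.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

sample_texts = ["甲方 乙方 签订 本 合同", "双方 应 遵守 保密 义务"]  # hypothetical segmented samples
tokenizer = Tokenizer(num_words=20000)            # MAX_NB_WORDS
tokenizer.fit_on_texts(sample_texts)
sequences = tokenizer.texts_to_sequences(sample_texts)
data = pad_sequences(sequences, maxlen=1000)      # MAX_SEQUENCE_LENGTH
print(data.shape)                                 # (2, 1000)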
def get_document_chapiter(self, sims, dictname):
    # Return up to two chapters of the dictionary document whose similarity score exceeds 0.4.
    textfname = "../dictionary/text/" + dictname + ".txt"
    law_document = LawDocument()
    law_document.analyze(filename=textfname)
    text = ""
    n_line = 1
    for sim in sims:
        doc_no, simil = sim[0], sim[1]
        if simil > 0.4:
            text += "******** " + str(n_line) + " ********\n"
            text += law_document.get_document_chapiter(doc_no) + "\n"
            n_line += 1
            if n_line > 2:
                break
        else:
            break
    return text
def search_document(self, textpath, filename):
    # Score the document's table of contents against every dictionary and
    # return the document type of the best-matching dictionary.
    ld = LawDocument()
    ld.analyze(filename=filename)
    doc_tab = []
    for dictfile in os.listdir("../dictionary/dict"):
        if dictfile.endswith(".dict"):
            dictname = dictfile.replace('.dict', '')
            total = 0.0
            for sentence in ld.table_contents:
                if len(sentence) > 1:
                    sims = self.text_search_lsi(textpath, sentence[1])
                    total += self.get_similarity_value(sims)
            doc_tab.append([dictname, total])
    doc_tab = sorted(doc_tab, key=lambda item: item[1], reverse=True)
    return self.get_document_type(doc_tab[0][0])
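# Self-contained gensim sketch of the LSI similarity lookup that text_search_lsi is
# assumed to perform (its implementation is not shown here): build a dictionary,
# TF-IDF and LSI space over a few toy documents, then rank them against a query.
# The toy documents and num_topics value are illustrative only.
from gensim import corpora, models, similarities

docs = [["合同", "甲方", "乙方"], ["保密", "义务", "条款"], ["工资", "支付", "期限"]]
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(d) for d in docs]
tfidf = models.TfidfModel(corpus, normalize=True)
lsi = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity(lsi[tfidf[corpus]])

query = dictionary.doc2bow(["保密", "条款"])
sims = sorted(enumerate(index[lsi[tfidf[query]]]), key=lambda x: x[1], reverse=True)
print(sims)  # [(doc_no, similarity), ...] -- the shape that get_similarity_value consumes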
def __init__(self):
    self.law_doc = LawDocument()
    self.file_index = 1
    self.folder_structure = {}
    self.folder_order = []
    self.corpus_document = []
    instance = Terms()
    self.categories = instance.get_all_term_items()
class EnbedTraining(object): def __init__(self, filename=None): self.MAX_SEQUENCE_LENGTH = 1000 self.MAX_NB_WORDS = 20000 self.EMBEDDING_DIM = 100 self.VALIDATION_SPLIT = 0.25 self.embeddings_index = self.load_embedding_base() self._document = LawDocument() self.label_name = [] self.texts = [] # list of text samples self.labels_index = {} # dictionary mapping label name to numeric id self.labels = [] # list of label ids self._debug = 1 pass def load_embedding_base(self): embeddings_index = {} f = codecs.open( os.path.join(variables.BASE_DIR, 'data/document_classification.txt'), 'r', 'utf-8') for line in f: values = line.split() if len(values) > 2: word = values[0] coefs = np.asarray(values[1:], dtype='float32') embeddings_index[word] = coefs f.close() return embeddings_index def _load_directory(self, path, label, label_id): for fname in sorted(os.listdir(path)): fpath = os.path.join(path, fname) if os.path.isdir(fpath): self._load_directory(fpath, label, label_id) elif fname.endswith(".txt"): words = self._document.get_normalize_document(fpath, outtype=0) if len(words) > 0: self.texts.append(words) self.classifiers.append(label) self.labels.append(label_id) self.file_label.append(fname) def load_data(self, path): self.label_name = [] self.texts = [] # list of text samples self.labels_index = {} # dictionary mapping label name to numeric id self.labels = [] # list of label ids self.classifiers = [] self.file_label = [] for fname in os.listdir(path): fpath = os.path.join(path, fname) if os.path.isdir(fpath): label_id = len(self.labels_index) self.labels_index[fname] = label_id self._load_directory(fpath, fname, label_id) # tokenizer tokenizer = Tokenizer(num_words=self.MAX_NB_WORDS) tokenizer.fit_on_texts(self.texts) self.sequences = tokenizer.texts_to_sequences(self.texts) self.word_index = tokenizer.word_index def create_traning_data(self): labels = to_categorical(np.asarray(self.labels)) # create data data = pad_sequences(self.sequences, maxlen=self.MAX_SEQUENCE_LENGTH) indices = np.arange(data.shape[0]) np.random.shuffle(indices) data = data[indices] labels = labels[indices] nb_validation_samples = int(self.VALIDATION_SPLIT * data.shape[0]) # training size and values x_train = data[:-nb_validation_samples] y_train = labels[:-nb_validation_samples] x_val = data[-nb_validation_samples:] y_val = labels[-nb_validation_samples:] return x_train, y_train, x_val, y_val def preparing_matrix(self): # prepare embedding matrix nb_words = min(self.MAX_NB_WORDS, len(self.word_index)) embedding_matrix = np.zeros((nb_words + 1, self.EMBEDDING_DIM)) for word, i in self.word_index.items(): if i > self.MAX_NB_WORDS: continue embedding_vector = self.embeddings_index.get(word) if embedding_vector is not None: # words not found in embedding index will be all-zeros. 
embedding_matrix[i] = embedding_vector # load pre-trained word embeddings into an Embedding layer # note that we set trainable = False so as to keep the embeddings fixed embedding_layer = Embedding(nb_words + 1, self.EMBEDDING_DIM, weights=[embedding_matrix], input_length=self.MAX_SEQUENCE_LENGTH, trainable=False) return embedding_layer def LSTMTraining(self, x_train, y_train, x_val, y_val): model = Sequential() # 词向量嵌入层,输入:词典大小,词向量大小,文本长度 model.add( Embedding(self.MAX_SEQUENCE_LENGTH, 100, input_length=self.MAX_NB_WORDS)) #model.add(Dropout(0.25)) model.add(LSTM(100)) #model.add(Flatten()) model.add(Convolution1D(128, 5, border_mode="valid", activation="relu")) # 全连接层 model.add(Dense(128)) model.add(Dropout(0.25)) model.add(Activation('relu')) model.add(Dense(128)) model.add(Activation('softmax')) model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'], verbose=1) result = model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=2, batch_size=128, verbose=1) def test(self): # input: meant to receive sequences of 100 integers, between 1 and 10000. main_input = Input(shape=(100, ), dtype='int32', name='main_input') # this embedding layer will encode the input sequence # into a sequence of dense 512-dimensional vectors. x = Embedding(output_dim=512, input_dim=10000, input_length=100)(main_input) # LSTM will transform the vector sequence into a single vector, # containing information about the entire sequence lstm_out = LSTM(32)(x) #insert the auxiliary loss, allowing the LSTM and Embedding layer to be trained #smoothly even though the main loss will be much higher in the model. auxiliary_output = Dense(1, activation='sigmoid', name='aux_output')(lstm_out) #we feed into the model our auxiliary input data by concatenating it with the LSTM output: auxiliary_input = Input(shape=(5, ), name='aux_input') #x = merge([lstm_out, auxiliary_input], mode='concat') # we stack a deep fully-connected network on top x = Dense(64, activation='relu')(x) x = Dense(64, activation='relu')(x) x = Dense(64, activation='relu')(x) # and finally we add the main logistic regression layer main_output = Dense(1, activation='sigmoid', name='main_output')(x) #This defines a model with two inputs and two outputs: model = Model(input=[main_input, auxiliary_input], output=[main_output, auxiliary_output]) model.compile(optimizer='rmsprop', loss='binary_crossentropy', loss_weights=[1., 0.2]) #We can train the model by passing it lists of input arrays and target arrays: ''' model.fit([headline_data, additional_data], [labels, labels], nb_epoch=50, batch_size=32) model.compile(optimizer='rmsprop', loss={'main_output': 'binary_crossentropy', 'aux_output': 'binary_crossentropy'}, loss_weights={'main_output': 1., 'aux_output': 0.2}) # and trained it via: model.fit({'main_input': headline_data, 'aux_input': additional_data}, {'main_output': labels, 'aux_output': labels}, nb_epoch=50, batch_size=32) ''' def clause_training(self, path): #from keras.utils.vis_utils import plot_model # 1, loading text samples and their labels self.load_data(path) # 2. create data into a training set and a validation set x_train, y_train, x_val, y_val = self.create_traning_data() # 4. create embedding layer embedding_layer = self.preparing_matrix() # 5. 
train a 1D convnet with global maxpooling sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH, ), dtype='int32', name='main_input') embedded_sequences = embedding_layer(sequence_input) #lstm_out = LSTM(32)(embedded_sequences) #x = MaxPooling1D(5)(lstm_out) x = Conv1D(128, 5, activation='relu')(embedded_sequences) x = MaxPooling1D(5)(x) x = Conv1D(128, 5, activation='relu')(x) x = MaxPooling1D(5)(x) x = Conv1D(128, 5, activation='relu')(x) x = MaxPooling1D(35)(x) x = Flatten()(x) x = Dense(128, activation='relu')(x) preds = Dense(len(self.labels_index), activation='softmax')(x) model = Model(sequence_input, preds) model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc']) #plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True) ''' history = model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=2, batch_size=128) # Estimate model performance #trainScore = model.evaluate(x_train, y_train, verbose=0) #print('Train Score: %.2f MSE (%.2f RMSE)' % (trainScore, math.sqrt(trainScore))) #testScore = model.evaluate(x_val, y_val, verbose=0) #print('Test Score: %.2f MSE (%.2f RMSE)' % (testScore, math.sqrt(testScore))) # generate predictions for training trainPredict = model.predict(x_train) testPredict = model.predict(x_val) ''' md.save_json_model(model, "clause_model") #md.save_yaml_model(model, "embedded_model") return model def loading(self): from keras.datasets import imdb from keras.preprocessing import sequence max_features = 5000 maxlen = 400 (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features) if self._debug: print(len(x_train), 'train sequences') print(len(x_test), 'test sequences') print('Pad sequences (samples x time)') x_train = sequence.pad_sequences(x_train, maxlen=maxlen) x_test = sequence.pad_sequences(x_test, maxlen=maxlen) if self._debug: print('x_train shape:', x_train.shape) print('x_test shape:', x_test.shape) return x_train, y_train, x_test, y_test def training_model(self): max_features = 5000 maxlen = 400 batch_size = 32 embedding_dims = 50 filters = 250 kernel_size = 3 hidden_dims = 250 epochs = 2 model = Sequential() '''we start off with an efficient embedding layer which maps our vocab indices into embedding_dims dimensions''' model.add(Embedding(max_features, embedding_dims, input_length=maxlen)) model.add(Dropout(0.2)) '''we add a Convolution1D, which will learn filters word group filters of size filter_length: ''' model.add( Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1)) # we use max pooling: model.add(GlobalMaxPooling1D()) # We add a vanilla hidden layer: model.add(Dense(hidden_dims)) model.add(Dropout(0.2)) model.add(Activation('relu')) # We project onto a single unit output layer, and squash it with a sigmoid: model.add(Dense(1)) model.add(Activation('sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) x_train, y_train, x_test, y_test = self.loading() model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test)) md.save_json_model(model, "json_model") md.save_yaml_model(model, "yaml_model") del model def test_imdb(self): from keras.callbacks import ModelCheckpoint from keras.utils import np_utils law_document = LawDocument() fname = os.path.join( variables.BASE_DIR, 'data/Corpus/TEXT/合同、协议/劳动合同/1. 
劳动合同- 最终版.DOC.txt') sentences = law_document.get_sentences(fname) # tokenizer tokenizer = Tokenizer(nb_words=self.MAX_NB_WORDS) tokenizer.fit_on_texts([sentences]) self.sequences = tokenizer.texts_to_sequences([sentences]) self.word_index = tokenizer.word_index seq_length = 10 data = [m for m in self.word_index.values()] index_word = {} for w, id in list(self.word_index.items()): index_word[id] = w dataX = [] dataY = [] length = len(data) - seq_length for i in range(0, length, seq_length): seq_in = data[i:i + seq_length - 1] seq_out = data[i + seq_length] dataX.append(seq_in) dataY.append(seq_out) """ raw_text = sentences chars = sorted(list(set("word telphone main"))) # create mapping of unique chars to integers chars = sorted(list(set(raw_text))) char_to_int = dict((c, i) for i, c in enumerate(chars)) int_to_char = dict((i, c) for i, c in enumerate(chars)) # summarize the loaded data n_chars = len(raw_text) n_vocab = len(chars) print ("Total Characters: ", n_chars) print ("Total Vocab: ", n_vocab) # prepare the dataset of input to output pairs encoded as integers seq_length = 100 dataX = [] dataY = [] for i in range(0, n_chars - seq_length, 1): seq_in = raw_text[i:i + seq_length] seq_out = raw_text[i + seq_length] dataX.append([char_to_int[char] for char in seq_in]) dataY.append(char_to_int[seq_out]) """ n_patterns = len(dataX) print("Total Patterns: ", n_patterns) # reshape X to be [samples, time steps, features] X = np.reshape(dataX, (n_patterns, seq_length - 1, 1)) n_vocab = len(index_word) # normalize X = X / float(n_vocab) # one hot encode the output variable y = np_utils.to_categorical(dataY) # define the LSTM model model = Sequential() model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]))) model.add(Dropout(0.2)) model.add(Dense(y.shape[1], activation='softmax')) model.compile(loss='categorical_crossentropy', optimizer='adam') # define the checkpoint filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5" checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min') callbacks_list = [checkpoint] # fit the model #model.fit(X, y, nb_epoch=20, batch_size=128, callbacks=callbacks_list) start = np.random.randint(0, len(dataX) - 1) pattern = dataX[start] print("Seed:") print("\"", ''.join([index_word[value] for value in pattern]), "\"") # generate characters for i in range(1000): x = np.reshape(pattern, (1, len(pattern), 1)) x = x / float(n_vocab) prediction = model.predict(x, verbose=0) index = np.argmax(prediction) result = index_word[index] seq_in = [index_word[value] for value in pattern] sys.stdout.write(result) pattern.append(index) pattern = pattern[1:len(pattern)] print("\nDone.") '''
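# Note on the class above: several calls use the Keras 1.x API (border_mode=,
# nb_epoch=, Tokenizer(nb_words=...), Model(input=..., output=...)); their Keras 2.x
# equivalents are padding=, epochs=, Tokenizer(num_words=...) and
# Model(inputs=..., outputs=...).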
class ClauseVerifying(object): def __init__(self, modelname=None): self._document = LawDocument() self.clause_model = clause_training.ClauseTraining() self.clause_model.load_model_label() pass def load_predict_document(self, filename): if (filename.endswith(".txt")): ofile = filename else: ofile = docutonelocate.convert_file(filename) self._document.read_section(ofile) texts = [] if len(self._document.sections) > 0: for section in self._document.sections: ss = [] if section.title: pass if len(section.sentences) > 0: ss = [p[0] for p in section.sentences] if len(ss) > 0: texts.append(doc.sentencesTowords(ss)) else: for s in self._document.document_header: texts.append(doc.sentencesTowords([s])) return texts def predict(self, filename): texts = self.load_predict_document(filename) tokenizer = Tokenizer(num_words=self.clause_model.MAX_NB_WORDS) tokenizer.fit_on_texts(texts) sequences = tokenizer.texts_to_sequences(texts) # create data data = pad_sequences(sequences, maxlen=self.clause_model.MAX_SEQUENCE_LENGTH) dtn_logger.logger_info("PREDICT", "Verification document : " + filename) dtn_logger.logger_info("PREDICT", "Predict Data : " + str(data.shape)) model = md.load_json_model(self.clause_model.MODEL_NAME) #model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy']) model.compile(loss='binary_crossentropy', optimizer=md.OPTIMIZER_ADAM, metrics=['accuracy']) for i, s in enumerate(data): s = data[np.array([i])] preds = model.predict(s) n = self.sample(preds[0]) print("*** " + self.clause_model.label_name[n] + "***") n = self.sample(preds[0], 0.8) print("*** " + self.clause_model.label_name[n] + "***") n = self.sample(preds[0], 0.2) print("*** " + self.clause_model.label_name[n] + "***") print(texts[i]) if i > 5: break def sample(self, p, temperature=1.0): # helper function to sample an index from a probability array preds = np.asarray(p).astype('float64') preds = np.asarray(preds).astype('float64') preds = np.log(preds) / temperature exp_preds = np.exp(preds) preds = exp_preds / np.sum(exp_preds) probas = np.random.multinomial(1, preds, 1) mmm = np.argmax(probas) print(mmm) return mmm
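# Standalone illustration (pure NumPy) of the temperature sampling used in
# ClauseVerifying.sample above: low temperature sharpens the softmax toward the
# arg-max label, high temperature flattens it. The probability vector is made up.
import numpy as np

def sample_with_temperature(p, temperature=1.0):
    preds = np.asarray(p).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return int(np.argmax(probas))

probs = [0.1, 0.6, 0.3]
print(sample_with_temperature(probs, 0.2))  # almost always 1 (the arg-max)
print(sample_with_temperature(probs, 1.0))  # usually 1, occasionally 0 or 2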
class Datasets(object): ''' create all types of document data set doc2vec label document type save to dataset data/dataset directory ''' NB_LOOP = 7 V_ALPHA = 0.2 V_MIN_ALPHA = 0.05 V_ALPHA_RATE = 0.005 def __init__(self): self.texts = [] # list of text samples self.labels_index = {} # dictionary mapping label name to numeric id self.labels_files = {} # dictionary mapping label name to numeric id self.labels_name = {} # dictionary mapping label name to numeric id self.file_label = [] # file label id self.labels = [] # list of label ids self.classifiers = [] # list of classifier self.law_doc = LawDocument() self.folder_structure = {} self.folder_order = [] pass def get_data_file_name(self, fname, isdataset=True): path = os.path.join(variables.BASE_DIR, variables.MODEL_DATA_DIR) if not os.path.exists(path): os.mkdir(path) if isdataset: path = os.path.join(path, 'datasets') if not os.path.exists(path): os.mkdir(path) return os.path.join(path, fname) def get_svc_file_name(self): return self.get_data_file_name(variables.SVC_MODEL) def get_model_file_name(self): return self.get_data_file_name(variables.DOC_MODEL) def get_word_model_name(self): return self.get_data_file_name(variables.WORD_MODEL) def get_dict_file_name(self): return self.get_data_file_name(variables.MODEL_DICT) def get_mm_file_name(self): return self.get_data_file_name(variables.MODEL_MM) def get_model_list_name(self): return self.get_data_file_name(variables.MODEL_LSI) """ all files in this directory are a classifier document """ def load_directory_for_document(self, path, label, label_id): nb_files = 0 for fname in sorted(os.listdir(path)): fpath = os.path.join(path, fname) if os.path.isdir(fpath): if fname in self.folder_structure.keys(): sublabel = self.folder_structure[fname][0] if sublabel == None: sublabel = label else: sublabel = label nb_files += self.load_directory_for_document( fpath, sublabel, label_id) elif fname.lower().endswith(".txt"): words = document.get_document_words(fpath) if len(words) > 10: self.texts.append(words) self.classifiers.append(label) self.labels.append(label_id) self.file_label.append(fname) nb_files += 1 else: #is not text file pass return nb_files ''' ''' def load_document_directorie(self, fpath, label): if label in self.labels_index.keys(): label_id = self.labels_index[label] nb = self.labels_files[label] else: label_id = len(self.labels_index) + 1 nb = 0 n = self.load_directory_for_document(fpath, label, label_id) if n > 5: self.labels_index[label] = label_id self.labels_name[label_id - 1] = label self.labels_files[label] = nb + n print(" === " + label + " : " + str(label_id) + " === ") def load_directories(self, path): for name in sorted(os.listdir(path)): fpath = os.path.join(path, name) if os.path.isdir(fpath): if name in self.folder_structure.keys(): label, level = self.folder_structure[name] if level == 0: continue elif not label: self.load_directories(fpath) continue else: label = name self.load_document_directorie(fpath, label) def load_text_files(self, text_path): self.texts = [] self.labels_index = {} self.labels_name = {} self.labels = [] self.file_label = [] # load defined directory type folder = Folder() # folder structure define classifier document type self.folder_structure = folder.load_folder_structure(text_path) for name in sorted(os.listdir(text_path)): path = os.path.join(text_path, name) if name != variables.TEMP_DIR and name != variables.DATA_DIR and os.path.isdir( path): if name in self.folder_structure.keys(): label, level = self.folder_structure[name] if level == 0: 
continue else: if label: self.load_document_directorie(path, label) else: self.load_directories(path) ''' load all file from merged files ''' def load_data_files(self, text_path): self.texts = [] self.labels_index = {} self.labels_name = {} self.labels = [] self.file_label = [] # load defined directory type folder = Folder() # folder structure define classifier document type self.folder_structure = folder.load_folder_structure(text_path) folder_order = folder.folder_order prevlabel = None for fname in sorted(os.listdir(text_path)): if (folder.CORPUS_FILE_NAME in fname): filename = os.path.join(text_path, fname) docs = folder.load_corpus_file(filename) for doc in docs: fname = doc[0][0] # file name name = doc[0][1] # type of file label = doc[0][2] # file label level = doc[0][3] # file level in directory if name in self.folder_structure.keys(): categorie, level = self.folder_structure[name] if categorie and ';' in categorie: label = categorie.split(';')[0] else: label = categorie if not categorie: savecat = None if name in self.folder_structure.keys(): for fn in folder_order: if fn == name and savecat != None: categorie = savecat if ';' in categorie: label = categorie.split(';')[0] else: label = categorie break else: categorie, level = self.folder_structure[ fn] if categorie: savecat = categorie else: categorie = doc[0][2] label = categorie sentences = doc[1:] norm_sentences = [ util.normalize_sentence(s) for s in sentences ] words = self.law_doc.get_normalize_document_from_sentences( norm_sentences, outtype=2) if len(words) > 0: # find same classifier if label in self.labels_index.keys(): label_id = self.labels_index[label] else: label_id = len(self.labels_index) + 1 self.labels_index[label] = label_id self.labels_name[label_id] = label # add document to text self.texts.append(words) # add label self.classifiers.append(categorie) # add label id self.labels.append(label_id) # add file name self.file_label.append(fname) if label in self.labels_files.keys(): self.labels_files[label] += 1 else: self.labels_files[label] = 1 if (prevlabel != label): print(" === " + label + " : " + str(label_id) + " === ") prevlabel = label def load_documents(self, text_path): if (text_path.endswith("training_data")): self.load_data_files(text_path) else: self.load_text_files(text_path) alldocs = [] doclists = [] for index in range(len(self.texts)): words = self.texts[index] if len(words) > 10: #string = 'doc_' + str(index+1) docs = TaggedDocument(words, tags=[index]) doclists.append(docs) alldocs.append(words) return alldocs, doclists def get_document_words(self, filename, f=None): f = File(filename, None, verbose=0) words = f.get_document_words() if len(words) > 10: return words else: return None def getTaggedDocuments(self, filename, index): words = self.get_document_words(filename) if words: return TaggedDocument(words, tags=[index]) else: return None def TrainingDoc2Vec(self, documents, size, window, nb_loop=NB_LOOP): #doc to vector #model = doc2vec.Doc2Vec(documents, size=size, window=window) alpha = self.V_ALPHA min_alpha = self.V_MIN_ALPHA model = doc2vec.Doc2Vec(documents, size=size, window=window, alpha=alpha, min_alpha=min_alpha) #model.sort_vocab() #model.build_vocab(documents) for epoch in range(nb_loop): random.shuffle(documents) model.train(documents, total_examples=len(documents), epochs=1, start_alpha=alpha, end_alpha=min_alpha) alpha -= self.V_ALPHA_RATE min_alpha = alpha # decrease the learning rate model.alpha -= self.V_ALPHA_RATE # decrease the learning rate model.min_alpha = model.alpha # fix 
the learning rate, no decay #err, err_count, test_count, predictor = error_rate_for_model(model, documents, test_docs) print("epoch = %d alpha = %f\n" % (epoch, model.alpha)) return model def create_dataset(self, text_path, min_count=2, sg=0, workers=1, size=256, window=5): """ min_count : ignore all words with total frequency lower than this. sg : sg = O CBOW, sg=1 skip-gram workers: thread size : dimension feature vectors. window : maximum distance between the current and predicted word within a sentence. """ dtn_logger.logger_info("DATASET", "create dataset " + text_path) lists, doclists = self.load_documents(text_path) dictionary = corpora.Dictionary(lists) corpus = [dictionary.doc2bow(text) for text in lists] # save corpus corpusfname = self.get_mm_file_name() corpora.MmCorpus.serialize(corpusfname, corpus) # save dictionay dictfname = self.get_dict_file_name() dictionary.save(dictfname) dictfname = self.get_model_list_name() # initialize a model tfidf = models.TfidfModel(corpus, normalize=True) # use the model to transform vectors corpus_tfidf = tfidf[corpus] # initialize an LSI transformation, LSI 2-D space lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300) lsi.save(dictfname) # same for tfidf, lda, ... #training doc2vec model = self.TrainingDoc2Vec(doclists, size=size, window=window, nb_loop=32) # save doc vector vectfname = self.get_model_file_name() model.save(vectfname) # word to vector model = word2vec.Word2Vec(lists, min_count=min_count, sg=sg, workers=workers, size=size, window=window) # save words vector vectfname = self.get_word_model_name() model.wv.save_word2vec_format(vectfname, binary=False) #model.sort_vocab() #model.build_vocab(sentences, update=False) # save file label self.save_filelabel() # save doc label self.save_doclabel() # save vector labels self.save_labelset() # save classifier labels self.save_classifierlabel() def load_wordvect(self): fname = self.get_word_model_name() f = codecs.open(fname, 'r', 'utf-8') sentences = f.read() sentences = sentences.split('\n') sentences = [s.split()[1:] for s in sentences] word2vec = [' '.join(s) for s in sentences if len(s) > 0] f.close() return word2vec def load_docvect(self): fname = self.get_model_file_name() model = doc2vec.Doc2Vec.load(fname) nbdocs = len(model.docvecs) resultlist = [] for i in range(nbdocs): #string = 'doc_' + str(i+1) #resultlist.append(model.docvecs[string]) vv = model.docvecs[i] vv = [v for v in vv] resultlist.append(vv) return resultlist def load_labelset(self): fname = self.get_data_file_name(variables.VECT_LABEL, True) f = codecs.open(fname, 'r', 'utf-8') labelSet = [int(line) for line in f if len(line.strip()) > 0] f.close() return labelSet def save_labelset(self): fname = self.get_data_file_name(variables.VECT_LABEL, True) f = codecs.open(fname, 'w', 'utf-8') for v in self.labels: f.write("%s\n" % (v)) f.close() def load_doclabel(self): fname = self.get_data_file_name(variables.DOC_LABEL, True) f = codecs.open(fname, 'r', 'utf-8') labelSet = [line.split('=')[0] for line in f if len(line.strip()) > 0] f.close() return labelSet def save_doclabel(self): # save doc label fname = self.get_data_file_name(variables.DOC_LABEL, True) f = codecs.open(fname, 'w', "utf-8") #for k, v in labels_index.items(): for v, k in self.labels_name.items(): f.write("%s=%d=%d\n" % (k, v, self.labels_files[k])) f.close() def load_filelabel(self): fname = self.get_data_file_name(variables.FILE_LABEL, True) f = codecs.open(fname, 'r', 'utf-8') labelSet = [line.split('=')[0] for line in f if 
len(line.strip()) > 0] f.close() return labelSet def save_filelabel(self): # save file label fname = self.get_data_file_name(variables.FILE_LABEL, True) f = codecs.open(fname, 'w', "utf-8") for index in range(len(self.file_label)): k = self.file_label[index] v = self.labels[index] f.write("%s=%d\n" % (k, v)) f.close() def load_classifierlabel(self): fname = self.get_data_file_name(variables.CLASSIFY_LABEL, True) f = codecs.open(fname, 'r', 'utf-8') labelSet = [line.strip() for line in f if len(line.strip()) > 0] f.close() return labelSet def save_classifierlabel(self): # save classifier label fname = self.get_data_file_name(variables.CLASSIFY_LABEL, True) f = codecs.open(fname, 'w', "utf-8") for v in self.classifiers: f.write("%s\n" % (v)) f.close() def test_corpus_dictionary(self): dictfname = self.get_dict_file_name() if (os.path.exists(dictfname)): dictionary = corpora.Dictionary.load(dictfname) corpusfname = self.get_mm_file_name() corpus = corpora.MmCorpus(corpusfname) print(corpus) else: print("corpus dictionary does not exist") def test_word_vector_model(self): vectfname = self.get_word_model_name() #sentences = LineSentence(vectfname) #sentences = Text8Corpus(vectfname) #sentences = LineSentence('compressed_text.txt.bz2') #sentences = LineSentence('compressed_text.txt.gz') model = word2vec.Word2Vec.load_word2vec_format(vectfname, binary=False) print("Test word2vec most similar for 驾驶, 机动车, 交通运输") print(model.most_similar(positive=['驾驶']))
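# Hypothetical usage sketch for the Datasets class above: build the corpus,
# dictionary, LSI, Doc2Vec and Word2Vec artefacts from a directory of converted
# text files, then sanity-check the saved corpus. The path is an assumption.
if __name__ == '__main__':
    ds = Datasets()
    ds.create_dataset('data/Corpus/TEXT')  # hypothetical corpus location
    ds.test_corpus_dictionary()            # prints the reloaded corpus if the files were written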
class Contract(object): ''' create legal terms classifier input model : data/terms/template output model : data/models ''' def __init__(self, debug=0, crf_model=True): self.texts = [] # list of legal terms tests self.terms_index = {} # mapping legal term name to numeric id self.terms_name = {} # legal term name self.terms_label = [] # mapping legal term name to label self.labels = [] # list of legal term label ids self._debug = debug self.seg = Segmentation() self.seg.load_suggest_words() self.lawdocument = LawDocument() self.clause = Clause() self.doc_type = None self.doc_path = None self.labor_model = True self.crf_model = crf_model def get_data_file_name(self, dataname, categorie='models') : path = variables.get_data_file_name(self.doc_path, categorie=categorie) if not os.path.exists(path) : os.mkdir(path) return os.path.join(path, dataname) def get_term_model_name(self) : return self.get_data_file_name(variables.TERM_DOC_MODEL) # term vector [0, 0, 1, 1, ...] def load_term_set(self) : fname = self.get_data_file_name(variables.TERM_VECT) f = codecs.open(fname, 'r', 'utf-8') termSet = [int(line) for line in f if len(line.strip()) > 0] f.close() return termSet def save_term_set(self) : fname = self.get_data_file_name(variables.TERM_VECT) f = codecs.open(fname, 'w', 'utf-8') for v in self.labels : f.write("%s\n" % (v)) f.close() # term name [termname=termid] def load_term_label(self) : fname = self.get_data_file_name(variables.TERM_LABEL) f = codecs.open(fname, 'r', 'utf-8') labelSet = [line.split('=')[0] for line in f if len(line.strip()) > 0] f.close() return labelSet def save_term_label(self) : fname = self.get_data_file_name(variables.TERM_LABEL) f = codecs.open(fname, 'w', 'utf-8') for v, k in self.terms_name.items(): f.write("%s=%d\n" % (k, v)) f.close() # term list def load_term_list(self) : fname = self.get_data_file_name(variables.TERM_LIST) f = codecs.open(fname, 'r', 'utf-8') termList = [line.split('=')[0] for line in f if len(line.strip()) > 0] f.close() return termList def save_term_list(self) : fname = self.get_data_file_name(variables.TERM_LIST) f = codecs.open(fname, 'w', 'utf-8') for index in range(len (self.terms_label)): k = self.terms_label[index] v = self.labels[index] f.write("%s=%d\n" % (k, v)) f.close() def _convert(self, text_path, convert=False) : path = text_path; if text_path.endswith("doc") : doc_path = text_path path = text_path[0:-3] + "TEXT" if (convert and os.path.exists(doc_path)) : conv = Convert(verbose=0) o_file = conv.open_output(doc_path, path) conv.files_to_text(doc_path, o_file) conv.close_output() return path def get_term_words(self, text) : if isinstance(text, string_types) : sentences = [text] else : sentences = text words_all_filter = self.seg.segment(sentences)[2] words = [] for sentences in words_all_filter : for w in sentences : if len(w.strip()) > 0 : words.append(w.strip()) return words def segment_terms(self, term_sentences): """ Arguments : term_sentences : test term sentences return segmentation words """ words_all_filter = self.seg.segment(term_sentences)[2] return words_all_filter def get_terms(self, filename, encoding="utf-8"): terms = [] self.lawdocument.create_document(filename, encoding) if len(self.lawdocument.sections) > 0 : for p in self.lawdocument.sections : term_sentences = [] term_sentences.append(p.title) for s in p.sentences : term_sentences.append(s[0]) # document sentence [s, num, type] terms.append(term_sentences) # if doc is not law document else : for p in self.lawdocument.document_header : terms.append([p]) pass 
return terms def load_file(self, filename, encoding="utf-8"): # directory name is document type ftype = os.path.basename(os.path.dirname(filename)) self.clause.create_clauses(filename, encoding=encoding) for p in self.clause.sections : name = p.title term_sentences = [] term_sentences.append(name) for s in p.sentences : term_sentences.append(s) # add term vector self.texts.append(self.segment_terms(term_sentences)) if name in self.terms_index : label_id = self.terms_index[name] else : label_id = len(self.terms_index)+1 self.terms_index[name] = label_id self.terms_name[label_id] = name self.labels.append(label_id) self.terms_label.append(name+":"+ftype) def load_directory(self, path) : for fname in sorted(os.listdir(path)): fpath = os.path.join(path, fname) if os.path.isdir(fpath): if variables.noloaddir(fname) : continue self.load_directory(fpath) elif fname.endswith(".txt"): self.load_file(fpath) else : #is not text file pass def load_terms(self, text_path) : self.load_directory(text_path) termdocs = [] # term doc2vec allterms = [] # term contents for index in range(len(self.texts)) : term = self.texts[index] if len(term) > 0 : s = [] for sentences in term : for word in sentences : s.append(word) #string = 'doc_' + str(index+1) docs = TaggedDocument(s, tags = [index]) termdocs.append(docs) allterms.append(s) return allterms, termdocs def _create_terms(self, text_path, doctype=None, min_count=2, sg=0, workers=1, size=256, window=5) : """ min_count : ignore all words with total frequency lower than this. sg : sg = O CBOW, sg=1 skip-gram workers: thread size : dimension feature vectors. window : maximum distance between the current and predicted word within a sentence. """ self.texts = [] # list of legal terms tests self.terms_index = {} # mapping legal term name to numeric id self.terms_name = {} # legal term name self.terms_label = [] # mapping legal term name to label self.labels = [] # list of legal term label ids self.doc_type = doctype self.doc_path = doctype path = text_path allterms, termdocs = self.load_terms(path) # if there is no more clauses, do nothing if len(allterms) < 10 : return dictionary = corpora.Dictionary(allterms) corpus = [dictionary.doc2bow(text) for text in allterms] # save corpus corpusfname = self.get_data_file_name(variables.TERM_MODEL_MM) corpora.MmCorpus.serialize(corpusfname, corpus) # save dictionary dictfname = self.get_data_file_name(variables.TERM_MODEL_DICT) dictionary.save(dictfname) dictfname = self.get_data_file_name(variables.TERM_MODEL_LSI) # initialize a model tfidf = models.TfidfModel(corpus, normalize=True) # use the model to transform vectors corpus_tfidf = tfidf[corpus] # initialize an LSI transformation, LSI 2-D space lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300) lsi.save(dictfname) # same for tfidf, lda, ... 
#trainig doc2vec datasets = Datasets() model = datasets.TrainingDoc2Vec(termdocs, size, window, 16) # save doc vector vectfname = self.get_term_model_name() model.save(vectfname) ''' # word to vector model = word2vec.Word2Vec(allterms, min_count=min_count, sg=sg, workers=workers, size=size, window=window) # save words vector vectfname = self.get_data_file_name(variables.TERM_WORD_MODEL) model.wv.save_word2vec_format(vectfname, binary=False) ''' # save term list self.save_term_list() # save term vector self.save_term_set() # save term name self.save_term_label() def create_crf(self, path) : crf = CRF() if self.labor_model : fpath = path + "/劳动合同" ftype = "劳动合同" crf.create_categorie_tagging(fpath, ftype) else : crf.create_crf_model() def create_terms(self, text_path, convert=False) : path = self._convert(text_path, convert) if self.crf_model : self.create_crf(path) else : self._create_terms(path, doctype=None) for doctype in sorted(os.listdir(path)): fpath = os.path.join(path, doctype) if os.path.isdir(fpath): self._create_terms(fpath, doctype=doctype)
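# Hypothetical usage sketch for the Contract class above: build the per-type
# legal-term models (or the CRF model when crf_model=True) from the template
# directory mentioned in the class docstring.
if __name__ == '__main__':
    contract = Contract(debug=1, crf_model=False)
    contract.create_terms('data/terms/template')  # path taken from the class docstring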
def __init__(self): """ """ self.law_document = LawDocument() self.all_keywords = util.load_legalterm_type()
class Summarize(object): CLUSTER_THRESHOLD = 5 # Distance between words to consider def __init__(self, filename=None): """ """ self.law_document = LawDocument() self.important_word = [] self.top_n_scored = [] self.mean_scored = [] def load_keywords(self): filename = self.law_document.get_keywords_file_name() f = codecs.open(filename, 'r', 'utf-8') self.important_word = [] for line in f: if line.strip(): tokens = line.strip().split(" ") if tokens[0].strip(): word = [tokens[0].strip()] if len(tokens) > 1 and tokens[1].strip(): word.append(int(tokens[1].strip())) else: word.append(0) if len(tokens) > 2 and tokens[2].strip(): word.append(int(tokens[2].strip())) self.important_word.append(word) f.close() return self.important_word def _cluster_sentences(self, s, important_word): word_idx = [] clusters = [] # For each word in the keyword list for [w, n] in important_word: word = w.strip() if word: try: index = s.index(word) word_idx.append(index) if n == 1: index = index + 1 word_idx.append(index) except ValueError: # w not in this particular sentence pass # Using the word index, compute clusters by using a max distance threshold, # for any two consecutive words if len(word_idx) > 0: word_idx.sort() cluster = [word_idx[0]] i = 1 while i < len(word_idx): if word_idx[i] - word_idx[i - 1] < self.CLUSTER_THRESHOLD: cluster.append(word_idx[i]) else: clusters.append(cluster[:]) cluster = [word_idx[i]] i += 1 clusters.append(cluster) return clusters def _score_sentences(self, sentences, important_word): scores = [] sentence_idx = -1 for [s, idx, type] in sentences: sentence_idx += 1 clusters = self._cluster_sentences(s, important_word) if len(clusters) == 0: continue # Score each cluster. The max score for any given cluster is the score # for the sentence max_cluster_score = 0 for c in clusters: significant_words_in_cluster = len(c) total_words_in_cluster = c[-1] - c[0] + 1 score = 1.0 * significant_words_in_cluster \ * significant_words_in_cluster / total_words_in_cluster if score > max_cluster_score: max_cluster_score = score if score > max_cluster_score: max_cluster_score = score scores.append((sentence_idx, score)) return scores def analyze(self, filename, withWeight=True, encoding="utf-8"): self.law_document.analyze_file(filename) self.load_keywords() scored_sentences = self._score_sentences(self.law_document.sentences, self.important_word) # Summaization Approach 1: # Filter out non-significant sentences by using the average score plus a # fraction of the std dev as a filter avg = numpy.mean([s[1] for s in scored_sentences]) std = numpy.std([s[1] for s in scored_sentences]) ff = avg + 0.5 * std self.mean_scored = [] for (sent_idx, score) in scored_sentences: if score > ff: self.mean_scored.append((sent_idx, score)) # Summarization Approach 2: # Another approach would be to return only the top N ranked sentences self.top_n_scored = sorted(scored_sentences, key=lambda s: s[1]) self.top_n_scored = sorted(self.top_n_scored, key=lambda s: s[0]) def write_top_summarize(self, show_nb=5, outputfile=None, mode="a+"): if outputfile != None: f = codecs.open(outputfile, mode, 'utf-8') f.write(' '.join(self.law_document.document_title) + "\n") f.write('\n'.join(self.law_document.table_contents)) f.write("\n\n摘要 : \n") else: f = None print('摘要 : ' + ' '.join(self.law_document.document_title) + "\n") n_sentence = 0 for (idx, score) in self.top_n_scored: if n_sentence < show_nb: sentence = self.law_document.get_document_chapiter(idx) if sentence: if f != None: f.write(sentence + "\n\n") else: print(sentence) 
print("=" * 20) n_sentence += 1 else: if f != None: f.write("\n" + "*" * 30 + "\n\n") break def write_summarize(self, show_nb=5, outputfile=None, mode="a+"): if outputfile != None: f = codecs.open(outputfile, mode, 'utf-8') f.write('摘要 : \n' + ' '.join(self.law_document.document_title) + "\n") else: f = None print('摘要 : ' + ' '.join(self.law_document.document_title) + "\n") self.law_document.init_sentence_index() n_sentence = 0 for (idx, score) in self.mean_scored: if n_sentence < show_nb: sentence = self.law_document.get_document_chapiter(idx) if sentence: if f != None: f.write(sentence) else: print(sentence) print(" " * 20) n_sentence += 1 else: if f != None: f.write("*" * 30) break
class TermsVerification(object): SIMU_SEUIL = 0.6 def __init__(self): self.contract = Contract(0) self.verified_terms = {} self._filetime = None self.fullname = None self.filename = None self._title = None self._contract_date = None self.keywords = [] self.segment = Segmentation() self.document = LawDocument() def _init_terms_table(self, filename, termtype): self.categorie = termtype # get file name self.fullname = filename self.filename = os.path.basename(filename).split('.')[0] # get file created date self._filetime = util.get_creation_file_date(filename) # init verfying tab self.verified_terms = {} self.keywords = dtn_sentence.get_document_categorie(termtype) for key in self.keywords: self.verified_terms[key] = ExtractData(key, termtype) dtn_logger.logger_info("VERIFY", "%s (%s)" % (filename, termtype)) def _load_terms_model(self, doctype=None): self.contract.doc_path = doctype self.term_names = self.contract.load_term_label() self.term_set = self.contract.load_term_set() self.term_list = self.contract.load_term_list() fname = self.contract.get_term_model_name() self.model = doc2vec.Doc2Vec.load(fname) def similar_term(self, term_words, termtype): tname = None ttype = None simu = 0.0 docvec = self.model.infer_vector(doc_words=term_words) sims = self.model.docvecs.most_similar(positive=[docvec], topn=5) for i in range(len(sims)): n_term = int(sims[i][0]) f_simu = sims[i][1] if f_simu > self.SIMU_SEUIL: if (n_term >= len(self.term_list)): continue '''term = self.term_names[self.term_set[n_term]-1]''' term_name = self.term_list[n_term] if ':' in term_name: tab = term_name.split(':', 1) if tab[1] == termtype: if tname == None: tname = tab[0] ttype = tab[1] simu = f_simu break elif term_name == termtype: tname = term_name ttype = term_name simu = f_simu break else: break return tname, ttype, simu def verify_term(self, text): term_words = self.contract.get_term_words(text) return self.similar_term(term_words) def _add_verified_sentences(self, termname, n_start, end_char, simu): nl = n_start st = self.document.norm_sentences[nl] ps = self.document.parser_sentence(st) if ps: st = ps[1] if ps[1][-1] is not ' ' and ps[2][0] is not ' ': st += ' ' st += ps[2] self.verified_terms[termname].add_value(st, simu) while len(st) == 0 or st[-1] != end_char: nl += 1 st = self.document.norm_sentences[nl] self.verified_terms[termname].add_value(st, 1) ''' get document term ''' def get_terms(self, filename, filetype): if (filename.endswith(".txt")): ofile = filename else: ofile = docutonelocate.convert_file(filename) #lawdocument.create_document(ofile, filetype) self.document.read_section(ofile) self._title = self.document.document_name self._contract_date = self.document.document_date if self._title: if '文件名称' in self.keywords: self.verified_terms['文件名称'].add_value(self._title, 1) elif '合同名称' in self.keywords: self.verified_terms['合同名称'].add_value(self._title, 1) if self._contract_date: if '签约日期' in self.keywords: self.verified_terms['签约日期'].add_value(self._contract_date, 1) elif '签发日期' in self.keywords: self.verified_terms['签发日期'].add_value(self._contract_date, 1) elif '合同日期' in self.keywords: self.verified_terms['合同日期'].add_value(self._contract_date, 1) terms = [] ''' prev_sentence = '' for s in ld.document_header : prev_sentence += s if ld._is_sentence_end(s) : terms.append([prev_sentence]) prev_sentence = '' if prev_sentence : terms.append([prev_sentence]) ''' nb = len(self.document.sections) if nb > 0: index = 0 while index < nb: p = self.document.sections[index] index += 1 ''' if section title = term 
name add it to verfied table ''' if p.title: termname = dtn_sentence.get_keywords_by_name( p.title, self.keywords) if termname: if len(p.sentences) > 0: for s in p.sentences: if isinstance(s, str): self.verified_terms[termname].add_value( s, 1) else: s_line = s[0] self._add_verified_sentences( termname, s[1], s_line[-1], 1) while index < nb: sp = self.document.sections[index] index += 1 if sp.level > p.level: for s in sp.sentences: if isinstance(s, str): self.verified_terms[ termname].add_value(s, 1) else: s_line = s[0] self._add_verified_sentences( termname, s[1], s_line[-1], 1) else: ''' back to prev section ''' index -= 1 break if len(p.sentences) > 0: terms.append(p.sentences) return terms def _verified_clauses(self, filename, termtype): terms = self.get_terms(filename, termtype) for term in terms: sentences = [s[0] for s in term] n_start = term[0][1] end_char = sentences[-1][-1] term_words = self.contract.get_term_words(sentences) tname, ttype, simu = self.similar_term(term_words, termtype) if ttype != None and tname != None: if ttype == termtype: if tname in self.verified_terms.keys(): ''' for s in sentences : self.verified_terms[tname].add_value(s, simu) ''' self._add_verified_sentences(tname, n_start, end_char, simu) def create_contract_model(self, fpath): self.contract.create_terms(fpath) def get_contract_date(self): return time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(self._filetime)) def verify_document(self, filename, doctype, termtype): # init clause table self._init_terms_table(filename, termtype) # load lagal terms training model self._load_terms_model(doctype) self._verified_clauses(filename, termtype) sorted_list = [] for key in self.keywords: if key in self.verified_terms.keys(): term = self.verified_terms[key].term_value if (len(term) > 0): sorted_list.append((key, 1, term)) else: sorted_list.append((key, 0)) return sorted_list def _to_html_text(self, term_list): lists = [] for elem in term_list: if len(elem) == 3: name, _, data = elem else: continue text = "" if len(data) > 0: for v, s_simu in data: if s_simu > 0: # is term name and find term string s = dtn_sentence.get_sentence(v) ss = dtn_document.law_document.parser_sentence(s) text += '<p>' if ss: text += '<b>' + ss[1] + ' ' + ss[2] + '</b></p>' text += '<p>' # empty line else: text += s text += '</p>' lists.append([name, text]) else: lists.append([name, text]) return lists def to_json(self, term_list): result = {} ''' result["FILE"] = [self.fullname] result["TEMPS"] = [str(self._filetime)] result["TTILE"] = [self._title] ''' result["filename"] = [self.filename, self.fullname, self.categorie] result["result"] = self._to_html_text(term_list) #result["result"] = self._to_list(lists) docutonejson.print_json(result) def example0(self): fname = config.TEST_PATH + "/劳动合同/Chanel劳动合同.docx.txt" ftype = "劳动合同" term_list = self.verify_document(fname, None, ftype) self.to_json(term_list) def example1(self): fname = config.TEST_PATH + "/章程/华能国际电力股份有限公司章程.pdf.txt" ftype = "有限责任公司章程" term_list = self.verify_document(fname, None, ftype) self.to_json(term_list) def example2(self): fname = config.TEST_PATH + "/章程/华能国际电力股份有限公司章程.docx.txt" ftype = "有限责任公司章程" term_list = self.verify_document(fname, None, ftype) self.to_json(term_list)
class Extraction(object): def __init__(self): """ """ self.law_document = LawDocument() self.all_keywords = util.load_legalterm_type() def score_sentences(self, sentences, important_word): scores = {} for s in sentences: for word in important_word: word = word[0] if word in s : index = s.index(word) index = index + len(word) sentence = s[index:] seg = Segmentation() #compare prev sentence if word in scores : n = scores[word][1] if index < n : scores[word] = [sentence, index] else : scores[word] = [sentence, index] # find only one word break return scores def extraction(self, filename, doctype='营业执照', encoding="utf-8"): document = self.law_document.get_segment_document(filename) [0] sentences = [] for sentence in document : s = "" for word in sentence : if word in util.sentence_delimiters : s += word + ' ' else : s += word sentences.append(s) important_words = self.all_keywords[doctype] scored_sentences = self.score_sentences(sentences, important_words) return scored_sentences def extraction_documents(self, fpath, doctype='营业执照') : data = {} for name in sorted(os.listdir(fpath)): if name.endswith('.txt') : fname = os.path.join(fpath, name) data[fname] = self.extraction(fname, doctype) return data def write_result(self, scored_sentences, important_word, outputfile = None): f = None for word in important_word : word = word[0] if word in scored_sentences : sentence = scored_sentences[word][0] if f != None : f.write(sentence) else : print (word + " : " + sentence) print ("="*40) def write_documents_info(self, data, doctype) : important_words = self.all_keywords[doctype] for fname, scored_sentences in data.items() : self.write_result(scored_sentences, important_words) def extraction_ner(self, st, filename, doctype='营业执照') : document = self.law_document.get_segment_document(filename) [0] prevtype = 'O' string = "" for sentence in document : sttag = st.tag(sentence) for word, type in sttag : if type == 'GPE' or type == 'ORG' or type == 'PRESON' : if prevtype == 'O' or prevtype == type : string += word prevtype = type else : print('%s %s ' % (string, prevtype)) string = "" prevtype = 'O' else : if len(string) > 0 : print('%s %s ' % (string, prevtype)) string = "" prevtype = 'O' def test_ner(self, fpath, doctype='营业执照') : from nltk.tag.stanford import StanfordNERTagger st = StanfordNERTagger('D:/WORK/docutone/java/classifiers/chinese.misc.distsim.crf.ser.gz', 'D:/WORK/docutone/java/lib/stanford-ner-3.7.0.jar') data = {} for name in sorted(os.listdir(fpath)): if name.endswith('.txt') : fname = os.path.join(fpath, name) data[fname] = self.extraction_ner(st, fname, doctype) return data def test_polyglot1(self) : import polyglot from polyglot.text import Text, Word text = Text("Bonjour, Mesdames.") print("Language Detected: Code={}, Name={}\n".format(text.language.code, text.language.name)) text = Text("第一条 机动车第三者责任保险合同(以下简称本保险合同)由保险条款、投保单、保险单、批单和特别约定共同组成。 " "本保险合同争议处理适用中华人民共和国法律。") #print(text.entities) """ print("{:<16}{}".format("Word", "POS Tag")+"\n"+"-"*30) for word, tag in text.pos_tags: print(u"{:<16}{:>2}".format(word, tag)) """ word = Word("Obama", language="en") word = Word("中华人民共和国", language="zh") print("Neighbors (Synonms) of {}".format(word)+"\n"+"-"*30) for w in word.neighbors: print("{:<16}".format(w)) print("\n\nThe first 10 dimensions out the {} dimensions\n".format(word.vector.shape[0])) print(word.vector[:10]) def test_polyglot(self) : from polyglot.mapping import Embedding embeddings = Embedding.load("/home/rmyeid/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2") neighbors = 
embeddings.nearest_neighbors("green")
class Text4Sentences(object):

    def __init__(self, stopwords_file=None):
        """
        Keyword arguments:
        stopwords_file : stopwords file name
        """
        self.pagerank_config = {'alpha': 0.85}
        self.seg = Segmentation(stopwords_file=stopwords_file)
        self.law_document = LawDocument()
        self.sentences = None
        self.words_no_filter = None  # two-dimensional list of words per sentence
        self.words_no_stop_words = None
        self.words_all_filters = None
        self.key_sentences = None

    def create_segment_sentences(self, sentences, sim_func=util.get_similarity):
        """
        Keyword arguments:
        sentences : sentences of the document
        sim_func  : function used to compute the similarity between two sentences
        """
        self.words_no_filter, self.words_no_stop_words, self.words_all_filters = self.seg.segment(sentences)
        self.sentences = sentences
        self.key_sentences = util.sort_sentences(sentences=self.sentences,
                                                 words=self.words_no_filter,
                                                 sim_func=sim_func,
                                                 pagerank_config=self.pagerank_config)

    def analyze_file(self, filename, encoding='utf-8'):
        """
        Keyword arguments:
        filename : input file name
        """
        self.law_document.create_document(filename=filename)
        self.create_segment_sentences(self.law_document.get_segmented_document())

    def get_key_sentences(self, num=6):
        """
        num : number of sentences used to build the summary.
        Return: the most important sentences.
        """
        result = []
        count = 0
        for item in self.key_sentences:
            if count >= num:
                break
            result.append(item)
            count += 1
        return result

    def show_key_sentences(self):
        for item in self.get_key_sentences(2):
            [sentence, idx, stype] = item['sentence']
            print(sentence)
            print("=" * 20)
            print(self.law_document.get_document_chapiter(idx, chapiter=True))
            print("--" * 20)
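# Hypothetical usage sketch for Text4Sentences above: rank sentences with the
# PageRank-based scorer and print the two best ones with their chapters.
# The input path is an assumption.
if __name__ == '__main__':
    t4s = Text4Sentences()
    t4s.analyze_file('data/Corpus/TEXT/劳动合同/example.txt')  # hypothetical input file
    t4s.show_key_sentences()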