Example #1
    def __init__(self, modelname=None):

        self.MAX_SEQUENCE_LENGTH = 1000
        self.MAX_NB_WORDS = 20000
        self.EMBEDDING_DIM = 100
        self.VALIDATION_SPLIT = 0.25
        self.EPOCHS = 64
        self.BATCH_SIZE = 32
        self.POOL_SIZE = 5
        self.FILTERS = 64
        self.LSTM_OUTPUT_SIZE = 70

        if modelname is None:
            self.MODEL_NAME = "clause_model"
        else:
            self.MODEL_NAME = modelname

        self._document = LawDocument()
        self._clause = Clause()

        self.texts = []  # list of text samples
        self.labels_index = {}  # dictionary mapping label name to numeric id
        self.labels = []  # list of label ids
        self.label_name = []
        self._debug = 1
        self._save_model = False
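The FILTERS, POOL_SIZE and LSTM_OUTPUT_SIZE constants suggest a Conv1D + LSTM text classifier. A minimal sketch of how they might be wired together, assuming the Keras 2 API; build_clause_model, cfg, num_words and num_labels are illustrative names, not part of ClauseTraining:

    from keras.models import Sequential
    from keras.layers import Embedding, Dropout, Conv1D, MaxPooling1D, LSTM, Dense

    def build_clause_model(num_words, num_labels, cfg):
        # cfg is any object exposing the constants above (e.g. the instance itself)
        model = Sequential()
        model.add(Embedding(num_words, cfg.EMBEDDING_DIM,
                            input_length=cfg.MAX_SEQUENCE_LENGTH))
        model.add(Dropout(0.25))
        # kernel size 5 chosen arbitrarily for illustration
        model.add(Conv1D(cfg.FILTERS, 5, padding='valid', activation='relu'))
        model.add(MaxPooling1D(pool_size=cfg.POOL_SIZE))
        model.add(LSTM(cfg.LSTM_OUTPUT_SIZE))
        model.add(Dense(num_labels, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        return model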
Example #2
    def __init__(self, modelname=None):

        self._document = LawDocument()
        self.clause_model = clause_training.ClauseTraining()
        self.clause_model.load_model_label()

Example #3
 def __init__(self, filename=None):
     """
     """
     self.law_document = LawDocument()
     self.important_word = []
     self.top_n_scored = []
     self.mean_scored = []
Example #4
 def get_document_type(self, dictname):

     textfname = "../dictionary/text/" + dictname + ".txt"
     law_document = LawDocument()
     law_document.analyze(filename=textfname)

     text = law_document.document_type

     return text
Example #5
 def get_document(self, dictname):

     textfname = "../dictionary/text/" + dictname + ".txt"
     law_document = LawDocument()
     law_document.analyze(filename=textfname)

     text = "\n".join(law_document.document_title)

     return text
Example #6
    def file_clean(self, filename):
        from docutone.core.document import LawDocument
        if filename.endswith(".txt"):
            ofile = filename
        else:
            # convert non-text documents to plain text first
            ofile = docutonelocate.convert_file(filename)
        lawdoc = LawDocument()
        document = lawdoc.get_fusion_document(ofile)

        for sentence in document:
            print(' '.join(sentence))
Example #7
    def __init__(self):

        self.texts = []  # list of text samples
        self.labels_index = {}  # dictionary mapping label name to numeric id
        self.labels_files = {}  # dictionary of files per label
        self.labels_name = {}  # dictionary of label names
        self.file_label = []  # file label id
        self.labels = []  # list of label ids
        self.classifiers = []  # list of classifiers
        self.law_doc = LawDocument()
        self.folder_structure = {}
        self.folder_order = []

Example #8
    def file_named_tag(self, filename):
        from docutone.core.document import LawDocument
        if filename.endswith(".txt"):
            ofile = filename
        else:
            ofile = docutonelocate.convert_file(filename)
        lawdoc = LawDocument()
        document = lawdoc.get_fusion_document(ofile)

        self.new_ner = {}
        for sentence in document:
            self.get_sentence_named_tag(sentence)

        self.write_ner()
Example #9
    def __init__(self):

        self.contract = Contract(0)

        self.verified_terms = {}
        self._filetime = None
        self.fullname = None
        self.filename = None
        self._title = None
        self._contract_date = None
        self.keywords = []

        self.segment = Segmentation()
        self.document = LawDocument()
Example #10
    def __init__(self):

        self.law_doc = LawDocument()

        self.file_index = 1
        self.folder_structure = {}
        self.folder_order = []

        self.corpus_document = []

        instance = Terms()
        self.categories = instance.get_all_term_items()
Example #11
 def __init__(self, debug=0, crf_model=True):
     
     self.texts = []        # list of legal term texts
     self.terms_index = {}  # mapping legal term name to numeric id
     self.terms_name = {}   # legal term names
     self.terms_label = []  # mapping legal term name to label
     self.labels = []       # list of legal term label ids
     self._debug = debug
     self.seg = Segmentation()
     self.seg.load_suggest_words()
     self.lawdocument = LawDocument()
     self.clause = Clause()
     self.doc_type = None
     self.doc_path = None
     self.labor_model = True
     self.crf_model = crf_model
Example #12
 def get_document_chapiter(self, sims, dictname):

     textfname = "../dictionary/text/" + dictname + ".txt"
     law_document = LawDocument()
     law_document.analyze(filename=textfname)
     text = ""
     n_line = 1
     for sim in sims:
         doc_no, simil = sim[0], sim[1]
         if simil > 0.4:
             text += "******** " + str(n_line) + "  ********\n"
             text += law_document.get_document_chapiter(doc_no) + "\n"
             n_line += 1
             if n_line > 2:
                 break
         else:
             break
     return text
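For reference, sims is expected to be an iterable of (doc_no, similarity) pairs sorted by decreasing score, typically the result of an LSI similarity query; the method keeps at most two chapters whose similarity exceeds 0.4. A self-contained sketch of that filter on hypothetical values:

    # illustrative values only: (doc_no, similarity), sorted by score
    sims = [(3, 0.82), (7, 0.55), (12, 0.31)]
    selected = [doc_no for doc_no, simil in sims if simil > 0.4][:2]
    print(selected)  # -> [3, 7]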
Example #13
    def __init__(self, stopwords_file=None):
        """
        Keyword arguments:
        stopwords_file :    stopwords file name
        """

        self.pagerank_config = {
            'alpha': 0.85,
        }

        self.seg = Segmentation(stopwords_file=stopwords_file)
        self.law_document = LawDocument()
        self.sentences = None
        self.words_no_filter = None  # 2-dimensional list (words per sentence)
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.key_sentences = None
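The pagerank_config with alpha = 0.85 and the segmented word lists point to a TextRank-style key-sentence extractor. A minimal sketch of the ranking step, assuming a precomputed sentence-similarity matrix and networkx; rank_sentences is an illustrative helper, not part of this class:

    import networkx as nx
    import numpy as np

    def rank_sentences(sim_matrix, alpha=0.85):
        # graph whose edge weights are pairwise sentence similarities
        graph = nx.from_numpy_array(np.asarray(sim_matrix, dtype=float))
        # PageRank with the damping factor taken from pagerank_config
        scores = nx.pagerank(graph, alpha=alpha)
        # sentence indices ordered from most to least central
        return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)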
Example #14
    def __init__(self, filename=None):

        self.MAX_SEQUENCE_LENGTH = 1000
        self.MAX_NB_WORDS = 20000
        self.EMBEDDING_DIM = 100
        self.VALIDATION_SPLIT = 0.25

        self.embeddings_index = self.load_embedding_base()

        self._document = LawDocument()

        self.label_name = []
        self.texts = []  # list of text samples
        self.labels_index = {}  # dictionary mapping label name to numeric id
        self.labels = []  # list of label ids
        self._debug = 1

Example #15
    def search_document(self, textpath, filename):
        ld = LawDocument()
        ld.analyze(filename=filename)

        doc_tab = []
        for dictfile in os.listdir("../dictionary/dict"):
            if dictfile.endswith(".dict"):
                dictname = dictfile.replace('.dict', '')
                total = 0.0
                for sentence in ld.table_contents:
                    if len(sentence) > 1:
                        sims = self.text_search_lsi(textpath, sentence[1])
                        total += self.get_similarity_value(sims)

                doc_tab.append([dictname, total])
        doc_tab = sorted(doc_tab, key=lambda item: item[1], reverse=True)

        return self.get_document_type(doc_tab[0][0])
Example #16
 def __init__(self):
     """
     """
     self.law_document = LawDocument()        
     self.all_keywords = util.load_legalterm_type()
Example #17
    def test_imdb(self):

        from keras.callbacks import ModelCheckpoint
        from keras.utils import np_utils

        law_document = LawDocument()

        fname = os.path.join(
            variables.BASE_DIR,
            'data/Corpus/TEXT/合同、协议/劳动合同/1. 劳动合同- 最终版.DOC.txt')
        sentences = law_document.get_sentences(fname)

        # tokenizer
        tokenizer = Tokenizer(nb_words=self.MAX_NB_WORDS)
        tokenizer.fit_on_texts([sentences])
        self.sequences = tokenizer.texts_to_sequences([sentences])
        self.word_index = tokenizer.word_index

        seq_length = 10
        # use the vocabulary ids themselves as a toy data stream
        data = list(self.word_index.values())

        # reverse mapping: word id -> word
        index_word = {idx: w for w, idx in self.word_index.items()}

        dataX = []
        dataY = []
        length = len(data) - seq_length
        for i in range(0, length, seq_length):
            seq_in = data[i:i + seq_length - 1]
            seq_out = data[i + seq_length]
            dataX.append(seq_in)
            dataY.append(seq_out)
        """
        raw_text = sentences
        
        chars = sorted(list(set("word telphone main")))
        
        # create mapping of unique chars to integers
        chars = sorted(list(set(raw_text)))
        char_to_int = dict((c, i) for i, c in enumerate(chars))
        int_to_char = dict((i, c) for i, c in enumerate(chars))
        # summarize the loaded data
        n_chars = len(raw_text)
        n_vocab = len(chars)
        print ("Total Characters: ", n_chars)
        print ("Total Vocab: ", n_vocab)
        # prepare the dataset of input to output pairs encoded as integers
        seq_length = 100
        dataX = []
        dataY = []
        for i in range(0, n_chars - seq_length, 1):
            seq_in = raw_text[i:i + seq_length]
            seq_out = raw_text[i + seq_length]
            dataX.append([char_to_int[char] for char in seq_in])
            dataY.append(char_to_int[seq_out])
        """

        n_patterns = len(dataX)
        print("Total Patterns: ", n_patterns)

        # reshape X to be [samples, time steps, features]
        X = np.reshape(dataX, (n_patterns, seq_length - 1, 1))
        n_vocab = len(index_word)
        # normalize
        X = X / float(n_vocab)

        # one hot encode the output variable
        y = np_utils.to_categorical(dataY)
        # define the LSTM model
        model = Sequential()
        model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
        model.add(Dropout(0.2))
        model.add(Dense(y.shape[1], activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        # define the checkpoint
        filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min')
        callbacks_list = [checkpoint]
        # fit the model
        #model.fit(X, y, nb_epoch=20, batch_size=128, callbacks=callbacks_list)

        start = np.random.randint(0, len(dataX) - 1)
        pattern = dataX[start]
        print("Seed:")
        print("\"", ''.join([index_word[value] for value in pattern]), "\"")
        # generate characters
        for i in range(1000):
            x = np.reshape(pattern, (1, len(pattern), 1))
            x = x / float(n_vocab)
            prediction = model.predict(x, verbose=0)
            index = np.argmax(prediction)
            result = index_word[index]
            seq_in = [index_word[value] for value in pattern]
            sys.stdout.write(result)
            pattern.append(index)
            pattern = pattern[1:len(pattern)]
        print("\nDone.")