Example #1

# NOTE: The import paths below are assumed from pysummarization's package layout;
# verify them against the installed version of the library.
from pysummarization.nlp_base import NlpBase
from pysummarization.web_scraping import WebScraping
from pysummarization.tokenizabledoc.mecab_tokenizer import MeCabTokenizer
from pysummarization.vectorizablesentence.lstm_rtrbm import LSTMRTRBM


def Main(url):
    '''
    Entry Point.
    
    Args:
        url:    target url.
    '''
    # The object for Web scraping.
    web_scrape = WebScraping()
    # Execute Web scraping.
    document = web_scrape.scrape(url)
    # The object of NLP.
    nlp_base = NlpBase()
    # Set the tokenizer. This is a Japanese tokenizer based on MeCab.
    nlp_base.tokenizable_doc = MeCabTokenizer()

    sentence_list = nlp_base.listup_sentence(document)

    batch_size = 10
    if len(sentence_list) < batch_size:
        raise ValueError("The number of extracted sentences is insufficient.")

    all_token_list = []
    for i in range(len(sentence_list)):
        nlp_base.tokenize(sentence_list[i])
        all_token_list.extend(nlp_base.token)
        sentence_list[i] = nlp_base.token

    vectorlizable_sentence = LSTMRTRBM()
    vectorlizable_sentence.learn(sentence_list=sentence_list,
                                 token_master_list=list(set(all_token_list)),
                                 hidden_neuron_count=1000,
                                 batch_size=batch_size,
                                 learning_rate=1e-03,
                                 seq_len=5)
    test_list = sentence_list[:batch_size]
    feature_points_arr = vectorlizable_sentence.vectorize(test_list)

    print("Feature points (Top 5 sentences):")
    print(feature_points_arr)
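A usage note for this example: the function above is only a definition. A minimal command-line entry point might look like the following sketch; the `__main__` guard and argument handling are assumptions, not part of the original example.

import sys

if __name__ == "__main__":
    # Hypothetical invocation: pass the target URL as the first command-line argument,
    # e.g. `python demo_lstm_rtrbm.py https://example.com/article`.
    Main(sys.argv[1])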
Example #2

# Requires the same imports as Example #1, with EncoderDecoder in place of LSTMRTRBM
# (assumed path: `from pysummarization.vectorizablesentence.encoder_decoder import EncoderDecoder`).
def Main(url):
    '''
    Entry Point.
    
    Args:
        url:    target url.
    '''
    # The object for Web scraping.
    web_scrape = WebScraping()
    # Execute Web scraping.
    document = web_scrape.scrape(url)
    # The object of NLP.
    nlp_base = NlpBase()
    # Set the tokenizer. This is a Japanese tokenizer based on MeCab.
    nlp_base.tokenizable_doc = MeCabTokenizer()

    sentence_list = nlp_base.listup_sentence(document)

    all_token_list = []
    for i in range(len(sentence_list)):
        nlp_base.tokenize(sentence_list[i])
        all_token_list.extend(nlp_base.token)
        sentence_list[i] = nlp_base.token
        
    vectorlizable_sentence = EncoderDecoder()
    vectorlizable_sentence.learn(
        sentence_list=sentence_list, 
        token_master_list=list(set(all_token_list)),
        epochs=60
    )
    test_list = sentence_list[:5]
    feature_points_arr = vectorlizable_sentence.vectorize(test_list)
    reconstruction_error_arr = vectorlizable_sentence.controller.get_reconstruction_error().mean()
    
    print("Feature points (Top 5 sentences):")
    print(feature_points_arr)
    print("Reconstruction error(MSE):")
    print(reconstruction_error_arr)
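As a follow-up to this example, the encoded feature points can be compared directly. The helper below is a sketch, assuming `feature_points_arr` is a 2-D NumPy array with one row per test sentence; it ranks the remaining sentences by cosine similarity to the first one.

import numpy as np

def rank_by_similarity(feature_points_arr):
    # Normalize each row so that dot products become cosine similarities.
    norm_arr = feature_points_arr / (np.linalg.norm(feature_points_arr, axis=1, keepdims=True) + 1e-08)
    # Cosine similarity of every other sentence against the first (query) sentence.
    similarity_arr = norm_arr[1:] @ norm_arr[0]
    # Indices into `feature_points_arr`, most similar first (offset by 1 for the query row).
    return np.argsort(similarity_arr)[::-1] + 1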
Example #3

    def learn(self,
              document,
              tokenizable_doc=None,
              hidden_neuron_count=200,
              epochs=100,
              batch_size=100,
              learning_rate=1e-05,
              learning_attenuate_rate=0.1,
              attenuate_epoch=50,
              bptt_tau=8,
              weight_limit=0.5,
              dropout_rate=0.5,
              test_size_rate=0.3,
              cluster_num=10,
              max_iter=100):
        '''
        Learning.
        
        Args:
            document:                       String of document.
            tokenizable_doc:                is-a `TokenizableDoc`.
            hidden_neuron_count:            The number of units in hidden layer.
            epochs:                         Epochs of mini-batch training.
            batch_size:                     Batch size of mini-batch training.
            learning_rate:                  Learning rate.
            learning_attenuate_rate:        Attenuate the `learning_rate` by a factor of this value every `attenuate_epoch`.
            attenuate_epoch:                Attenuate the `learning_rate` by a factor of `learning_attenuate_rate` every `attenuate_epoch`.
                                            Additionally, in relation to regularization,
                                            this class constrains the weight matrices every `attenuate_epoch`.

            bptt_tau:                       Maximum referred step `t` in Backpropagation Through Time (BPTT).
            weight_limit:                   Regularization for the weight matrix:
                                            the weights are repeatedly multiplied by `0.9`
                                            until $\sum_{j=0}^{n}w_{ji}^2 < weight\_limit$.

            dropout_rate:                   The probability of dropout.
            test_size_rate:                 Size of the test data set. If this value is `0`,
                                            validation is not executed.
            cluster_num:                    The number of clusters.
            max_iter:                       Maximum number of iterations.

        '''
        # The object of NLP.
        nlp_base = NlpBase()
        if tokenizable_doc is None:
            # Set the tokenizer. This is a Japanese tokenizer based on MeCab.
            nlp_base.tokenizable_doc = MeCabTokenizer()
        else:
            nlp_base.tokenizable_doc = tokenizable_doc

        sentence_list = nlp_base.listup_sentence(document)

        all_token_list = []
        for i in range(len(sentence_list)):
            nlp_base.tokenize(sentence_list[i])
            all_token_list.extend(nlp_base.token)
            sentence_list[i] = nlp_base.token

        token_master_list = list(set(all_token_list))
        vectorlizable_sentence = EncoderDecoder()
        vectorlizable_sentence.learn(
            sentence_list=sentence_list,
            token_master_list=token_master_list,
            hidden_neuron_count=hidden_neuron_count,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            learning_attenuate_rate=learning_attenuate_rate,
            attenuate_epoch=attenuate_epoch,
            bptt_tau=bptt_tau,
            weight_limit=weight_limit,
            dropout_rate=dropout_rate,
            test_size_rate=test_size_rate)
        self.__vectorlizable_sentence = vectorlizable_sentence
        self.__token_master_list = token_master_list

        feature_arr = vectorlizable_sentence.vectorize(sentence_list)

        self.__clusterable_doc = KMeans(
            cluster_num=cluster_num,
            max_iter=max_iter,
            init_noise_arr=np.random.normal(size=feature_arr.shape))
        self.__labeled_arr = self.__clusterable_doc.learn(feature_arr)
        self.__sentence_list = sentence_list
        self.__batch_size = batch_size
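The labels produced by `KMeans` above are only stored on the instance. The helper below is a hypothetical sketch, assuming `labeled_arr` is a 1-D array of cluster indices aligned with `sentence_list`; it shows one way the clustering result could be turned back into readable groups of sentences.

from collections import defaultdict

def group_sentences_by_cluster(labeled_arr, sentence_list):
    # Map each cluster label to the surface form of its tokenized sentences.
    cluster_dict = defaultdict(list)
    for label, token_list in zip(labeled_arr, sentence_list):
        # Tokens come from MeCab, so joining without spaces reconstructs the Japanese sentence.
        cluster_dict[int(label)].append("".join(token_list))
    return dict(cluster_dict)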
Example #4

    def __init__(self,
                 document,
                 tokenizable_doc=None,
                 hidden_neuron_count=200,
                 epochs=100,
                 batch_size=100,
                 learning_rate=1e-05,
                 learning_attenuate_rate=0.1,
                 attenuate_epoch=50,
                 bptt_tau=8,
                 weight_limit=0.5,
                 dropout_rate=0.5,
                 test_size_rate=0.3,
                 debug_mode=False):
        '''
        Init.
        
        Args:
            document:                       String of document.
            tokenizable_doc:                is-a `TokenizableDoc`.
            hidden_neuron_count:            The number of units in hidden layer.
            epochs:                         Epochs of mini-batch training.
            batch_size:                     Batch size of mini-batch training.
            learning_rate:                  Learning rate.
            learning_attenuate_rate:        Attenuate the `learning_rate` by a factor of this value every `attenuate_epoch`.
            attenuate_epoch:                Attenuate the `learning_rate` by a factor of `learning_attenuate_rate` every `attenuate_epoch`.
                                            Additionally, in relation to regularization,
                                            this class constrains the weight matrices every `attenuate_epoch`.

            bptt_tau:                       Maximum referred step `t` in Backpropagation Through Time (BPTT).
            weight_limit:                   Regularization for the weight matrix:
                                            the weights are repeatedly multiplied by `0.9`
                                            until $\sum_{j=0}^{n}w_{ji}^2 < weight\_limit$.

            dropout_rate:                   The probability of dropout.
            test_size_rate:                 Size of the test data set. If this value is `0`,
                                            validation is not executed.
            debug_mode:                     Debug mode or not.
        '''
        if debug_mode is True:
            logger = getLogger("pydbm")
            handler = StreamHandler()
            handler.setLevel(DEBUG)
            logger.setLevel(DEBUG)
            logger.addHandler(handler)

            logger = getLogger("pysummarization")
            handler = StreamHandler()
            handler.setLevel(DEBUG)
            logger.setLevel(DEBUG)
            logger.addHandler(handler)

        # The object of NLP.
        nlp_base = NlpBase()
        if tokenizable_doc is None:
            # Set the tokenizer. This is a Japanese tokenizer based on MeCab.
            nlp_base.tokenizable_doc = MeCabTokenizer()
        else:
            nlp_base.tokenizable_doc = tokenizable_doc

        sentence_list = nlp_base.listup_sentence(document)

        all_token_list = []
        for i in range(len(sentence_list)):
            nlp_base.tokenize(sentence_list[i])
            all_token_list.extend(nlp_base.token)
            sentence_list[i] = nlp_base.token

        token_master_list = list(set(all_token_list))
        vectorlizable_sentence = EncoderDecoder()
        vectorlizable_sentence.learn(
            sentence_list=sentence_list,
            token_master_list=token_master_list,
            hidden_neuron_count=hidden_neuron_count,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            learning_attenuate_rate=learning_attenuate_rate,
            attenuate_epoch=attenuate_epoch,
            bptt_tau=bptt_tau,
            weight_limit=weight_limit,
            dropout_rate=dropout_rate,
            test_size_rate=test_size_rate)
        self.__vectorlizable_sentence = vectorlizable_sentence
        self.__token_master_list = token_master_list
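The constructor above only trains the encoder-decoder and stores it in private attributes. A companion method along the lines of the sketch below (hypothetical, not part of the original class) could expose the trained model for encoding additional, already tokenized sentences.

    def vectorize(self, tokenized_sentence_list):
        '''
        Hypothetical helper: encode already tokenized sentences
        with the `EncoderDecoder` trained in `__init__`.

        Args:
            tokenized_sentence_list:    `list` of `list` of tokens.

        Returns:
            Feature points as returned by the underlying `EncoderDecoder`.
        '''
        return self.__vectorlizable_sentence.vectorize(tokenized_sentence_list)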
Example #5

    def learn(self,
              document,
              tokenizable_doc,
              hidden_neuron_count=1000,
              training_count=1,
              batch_size=10,
              learning_rate=1e-03,
              seq_len=5,
              cluster_num=10,
              max_iter=100):
        '''
        Learning.
        
        Args:
            document:                       String of document.
            tokenizable_doc:                is-a `TokenizableDoc`.
            hidden_neuron_count:            The number of units in hidden layer.
            training_count:                 The number of training iterations.
            batch_size:                     Batch size of mini-batch training.
            learning_rate:                  Learning rate.
            seq_len:                        The length of one sequence.
            cluster_num:                    The number of clusters.
            max_iter:                       Maximum number of iterations.
        '''
        # The object of NLP.
        nlp_base = NlpBase()
        if tokenizable_doc is None:
            # Set the tokenizer. This is a Japanese tokenizer based on MeCab.
            nlp_base.tokenizable_doc = MeCabTokenizer()
        else:
            nlp_base.tokenizable_doc = tokenizable_doc

        sentence_list = nlp_base.listup_sentence(document)
        all_token_list = []
        for i in range(len(sentence_list)):
            nlp_base.tokenize(sentence_list[i])
            all_token_list.extend(nlp_base.token)
            sentence_list[i] = nlp_base.token

        token_master_list = list(set(all_token_list))
        vectorlizable_sentence = LSTMRTRBM()
        vectorlizable_sentence.learn(sentence_list=sentence_list,
                                     token_master_list=token_master_list,
                                     hidden_neuron_count=hidden_neuron_count,
                                     training_count=training_count,
                                     batch_size=batch_size,
                                     learning_rate=learning_rate,
                                     seq_len=seq_len)

        feature_arr = vectorlizable_sentence.vectorize(sentence_list)

        self.__clusterable_doc = KMeans(
            cluster_num=cluster_num,
            max_iter=max_iter,
            init_noise_arr=np.random.normal(size=feature_arr.shape))
        self.__labeled_arr = self.__clusterable_doc.learn(feature_arr)

        self.__vectorlizable_sentence = vectorlizable_sentence
        self.__token_master_list = token_master_list
        self.__sentence_list = sentence_list
        self.__batch_size = batch_size
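For context, calling this `learn` method requires an instance of the class it belongs to. The snippet below is a usage sketch; the class name `LSTMRTRBMClustering` is a hypothetical stand-in used only for illustration, and the document string is a placeholder.

# Hypothetical usage sketch; `LSTMRTRBMClustering` stands in for the class that owns `learn`.
document = "..."  # plain text of the target document
clustering = LSTMRTRBMClustering()
clustering.learn(
    document=document,
    tokenizable_doc=MeCabTokenizer(),
    hidden_neuron_count=1000,
    batch_size=10,
    cluster_num=10
)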