def Main(url, similarity_mode="TfIdfCosine", similarity_limit=0.75):
    '''
    Entry Point.

    Scrape the PDF at `url`, summarize the extracted document and print
    at most the first three summarized sentences.

    Args:
        url:                PDF url.
        similarity_mode:    Name of the `Similarity Filter` to use.
                            One of `TfIdfCosine`, `Dice`, `Jaccard` or `Simpson`.
        similarity_limit:   If the similarity exceeds this value, the sentence will be cut off.

    Raises:
        ValueError:         `similarity_mode` is not one of the names listed above.
    '''
    if similarity_mode == "TfIdfCosine":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is so-called cosine similarity of Tf-Idf vectors.
        similarity_filter = TfIdfCosine()
    elif similarity_mode == "Dice":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Dice coefficient.
        similarity_filter = Dice()
    elif similarity_mode == "Jaccard":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Jaccard coefficient.
        similarity_filter = Jaccard()
    elif similarity_mode == "Simpson":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Simpson coefficient.
        similarity_filter = Simpson()
    else:
        # Name the offending value so the caller can see what was rejected.
        raise ValueError(
            "Unsupported similarity_mode: {!r}".format(similarity_mode)
        )

    # The object of the NLP.
    nlp_base = NlpBase()
    # Set tokenizer. This is japanese tokenizer with MeCab.
    nlp_base.tokenizable_doc = MeCabTokenizer()
    # Set the object of NLP.
    similarity_filter.nlp_base = nlp_base
    # If the similarity exceeds this value, the sentence will be cut off.
    similarity_filter.similarity_limit = similarity_limit

    # The object of Web-scraping.
    web_scrape = WebScraping()
    # Set the object of reading PDF files.
    web_scrape.readable_web_pdf = WebPDFReading()
    # Execute Web-scraping.
    document = web_scrape.scrape(url)

    # The object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer. This is japanese tokenizer with MeCab.
    auto_abstractor.tokenizable_doc = MeCabTokenizer()
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Execute summarization.
    result_dict = auto_abstractor.summarize(document, abstractable_doc, similarity_filter)

    # Output summarized sentences (at most the first three).
    # A plain loop is used because printing is a side effect, not a value to collect.
    for sentence in result_dict["summarize_result"][:3]:
        print(sentence)
def Main(url):
    '''
    Entry Point.

    Scrape `url`, tokenize every extracted sentence, train an `LSTMRTRBM`
    on the tokens and print the feature points of the first `batch_size`
    sentences.

    Args:
        url:    target url.

    Raises:
        ValueError: Fewer than `batch_size` (10) sentences were extracted.
    '''
    # The object of Web-Scraping.
    web_scrape = WebScraping()
    # Execute Web-Scraping.
    document = web_scrape.scrape(url)

    # The object of NLP.
    nlp_base = NlpBase()
    # Set tokenizer. This is japanese tokenizer with MeCab.
    nlp_base.tokenizable_doc = MeCabTokenizer()

    sentence_list = nlp_base.listup_sentence(document)

    batch_size = 10
    if len(sentence_list) < batch_size:
        raise ValueError("The number of extracted sentences is insufficient.")

    # Replace each sentence by its token list (in place) while collecting
    # every token for the vocabulary.
    all_token_list = []
    for i, sentence in enumerate(sentence_list):
        nlp_base.tokenize(sentence)
        token_list = nlp_base.token
        all_token_list.extend(token_list)
        sentence_list[i] = token_list

    vectorlizable_sentence = LSTMRTRBM()
    vectorlizable_sentence.learn(
        sentence_list=sentence_list,
        token_master_list=list(set(all_token_list)),
        hidden_neuron_count=1000,
        batch_size=batch_size,
        learning_rate=1e-03,
        seq_len=5
    )

    test_list = sentence_list[:batch_size]
    feature_points_arr = vectorlizable_sentence.vectorize(test_list)

    # The header reports the number of sentences actually vectorized
    # (`batch_size`), not a hard-coded count.
    print("Feature points (Top {} sentences):".format(batch_size))
    print(feature_points_arr)
def Main(url):
    '''
    Entry Point.

    Scrape `url`, tokenize every extracted sentence, train an
    `EncoderDecoder` on the tokens, then print the feature points of the
    first five sentences and the mean reconstruction error.

    Args:
        url:    target url.
    '''
    # Fetch the raw document through Web-Scraping.
    scraper = WebScraping()
    document = scraper.scrape(url)

    # NLP helper using the japanese tokenizer with MeCab.
    nlp = NlpBase()
    nlp.tokenizable_doc = MeCabTokenizer()

    sentences = nlp.listup_sentence(document)

    # Swap each sentence for its token list, gathering every token seen.
    seen_tokens = []
    for position, sentence in enumerate(sentences):
        nlp.tokenize(sentence)
        seen_tokens.extend(nlp.token)
        sentences[position] = nlp.token

    vocabulary = list(set(seen_tokens))

    encoder_decoder = EncoderDecoder()
    encoder_decoder.learn(
        sentence_list=sentences,
        token_master_list=vocabulary,
        epochs=60
    )

    head_sentences = sentences[:5]
    feature_points = encoder_decoder.vectorize(head_sentences)
    mean_error = encoder_decoder.controller.get_reconstruction_error().mean()

    print("Feature points (Top 5 sentences):")
    print(feature_points)
    print("Reconstruction error(MSE):")
    print(mean_error)
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.mecab_tokenizer import MeCabTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor
from pysummarization.nlp_base import NlpBase
from pysummarization.similarityfilter.tfidf_cosine import TfIdfCosine

# The object of NLP.
nlp_base = NlpBase()
# Set tokenizer. This is japanese tokenizer with MeCab.
nlp_base.tokenizable_doc = MeCabTokenizer()

# The object of `Similarity Filter`.
# The similarity observed by this object is so-called cosine similarity of Tf-Idf vectors.
similarity_filter = TfIdfCosine()

# Set the object of NLP.
similarity_filter.nlp_base = nlp_base

# If the similarity exceeds this value, the sentence will be cut off.
similarity_filter.similarity_limit = 0.20

# Sample Japanese text to be summarized.
document = '人間がお互いにコミュニケーションを行うための自然発生的な言語である。「自然言語」に対置される語に「形式言語」「人工言語」がある。形式言語との対比では、その構文や意味が明確に揺るぎなく定められ利用者に厳格な規則の遵守を強いる(ことが多い)形式言語に対し、話者集団の社会的文脈に沿った曖昧な規則が存在していると考えられるものが自然言語である。自然言語には、規則が曖昧であるがゆえに、話者による規則の解釈の自由度が残されており、話者が直面した状況に応じて規則の解釈を変化させることで、状況を共有する他の話者とのコミュニケーションを継続する事が可能となっている。'

# The object of automatic summarization.
auto_abstractor = AutoAbstractor()
# Set tokenizer. This is japanese tokenizer with MeCab.
auto_abstractor.tokenizable_doc = MeCabTokenizer()
# Object of abstracting and filtering document.
# NOTE(review): the statement this comment introduces is not visible in this excerpt.
def learn(
    self,
    document,
    tokenizable_doc=None,
    hidden_neuron_count=200,
    epochs=100,
    batch_size=100,
    learning_rate=1e-05,
    learning_attenuate_rate=0.1,
    attenuate_epoch=50,
    bptt_tau=8,
    weight_limit=0.5,
    dropout_rate=0.5,
    test_size_rate=0.3,
    cluster_num=10,
    max_iter=100
):
    r'''
    Learning.

    Tokenize `document` sentence by sentence, train an `EncoderDecoder`
    on the tokens, vectorize every sentence and cluster the resulting
    feature points with `KMeans`. The trained objects, the token master
    list, the tokenized sentences, the cluster labels and `batch_size`
    are stored on the instance.

    Args:
        document:                   String of document.
        tokenizable_doc:            is-a `TokenizableDoc`.
                                    If `None`, `MeCabTokenizer` is used.
        hidden_neuron_count:        The number of units in hidden layer.
        epochs:                     Epochs of Mini-batch.
        batch_size:                 Batch size of Mini-batch.
        learning_rate:              Learning rate.
        learning_attenuate_rate:    Attenuate the `learning_rate` by a factor of this value every `attenuate_epoch`.
        attenuate_epoch:            Attenuate the `learning_rate` by a factor of `learning_attenuate_rate` every `attenuate_epoch`.
                                    Additionally, in relation to regularization,
                                    this class constrains weight matrixes every `attenuate_epoch`.

        bptt_tau:                   Referred maximum step `t` in Backpropagation Through Time(BPTT).
        weight_limit:               Regularization for weights matrix
                                    to repeat multiplying the weights matrix and `0.9`
                                    until $\sum_{j=0}^{n}w_{ji}^2 < weight\_limit$.

        dropout_rate:               The probability of dropout.
        test_size_rate:             Size of Test data set.
                                    Passed through unchanged to `EncoderDecoder.learn`.
        cluster_num:                The number of clusters.
        max_iter:                   Maximum number of iterations.
    '''
    # The object of NLP.
    nlp_base = NlpBase()
    if tokenizable_doc is None:
        # Set tokenizer. This is japanese tokenizer with MeCab.
        nlp_base.tokenizable_doc = MeCabTokenizer()
    else:
        nlp_base.tokenizable_doc = tokenizable_doc

    sentence_list = nlp_base.listup_sentence(document)

    # Replace each sentence by its token list (in place) while collecting
    # every token for the master list.
    all_token_list = []
    for i, sentence in enumerate(sentence_list):
        nlp_base.tokenize(sentence)
        token_list = nlp_base.token
        all_token_list.extend(token_list)
        sentence_list[i] = token_list

    token_master_list = list(set(all_token_list))

    vectorlizable_sentence = EncoderDecoder()
    vectorlizable_sentence.learn(
        sentence_list=sentence_list,
        token_master_list=token_master_list,
        hidden_neuron_count=hidden_neuron_count,
        epochs=epochs,
        batch_size=batch_size,
        learning_rate=learning_rate,
        learning_attenuate_rate=learning_attenuate_rate,
        attenuate_epoch=attenuate_epoch,
        bptt_tau=bptt_tau,
        weight_limit=weight_limit,
        dropout_rate=dropout_rate,
        test_size_rate=test_size_rate
    )

    self.__vectorlizable_sentence = vectorlizable_sentence
    self.__token_master_list = token_master_list

    feature_arr = vectorlizable_sentence.vectorize(sentence_list)

    # The initial noise has the same shape as the feature points.
    self.__clusterable_doc = KMeans(
        cluster_num=cluster_num,
        max_iter=max_iter,
        init_noise_arr=np.random.normal(size=feature_arr.shape)
    )
    self.__labeled_arr = self.__clusterable_doc.learn(feature_arr)
    self.__sentence_list = sentence_list
    self.__batch_size = batch_size
def __init__(
    self,
    document,
    tokenizable_doc=None,
    hidden_neuron_count=200,
    epochs=100,
    batch_size=100,
    learning_rate=1e-05,
    learning_attenuate_rate=0.1,
    attenuate_epoch=50,
    bptt_tau=8,
    weight_limit=0.5,
    dropout_rate=0.5,
    test_size_rate=0.3,
    debug_mode=False
):
    r'''
    Init.

    Tokenize `document` sentence by sentence and train an `EncoderDecoder`
    on the tokens. The trained object and the token master list are
    stored on the instance.

    Args:
        document:                   String of document.
        tokenizable_doc:            is-a `TokenizableDoc`.
                                    If `None`, `MeCabTokenizer` is used.
        hidden_neuron_count:        The number of units in hidden layer.
        epochs:                     Epochs of Mini-batch.
        batch_size:                 Batch size of Mini-batch.
        learning_rate:              Learning rate.
        learning_attenuate_rate:    Attenuate the `learning_rate` by a factor of this value every `attenuate_epoch`.
        attenuate_epoch:            Attenuate the `learning_rate` by a factor of `learning_attenuate_rate` every `attenuate_epoch`.
                                    Additionally, in relation to regularization,
                                    this class constrains weight matrixes every `attenuate_epoch`.

        bptt_tau:                   Referred maximum step `t` in Backpropagation Through Time(BPTT).
        weight_limit:               Regularization for weights matrix
                                    to repeat multiplying the weights matrix and `0.9`
                                    until $\sum_{j=0}^{n}w_{ji}^2 < weight\_limit$.

        dropout_rate:               The probability of dropout.
        test_size_rate:             Size of Test data set.
                                    Passed through unchanged to `EncoderDecoder.learn`.
        debug_mode:                 Debug mode or not.
                                    Only the value `True` itself enables it.
    '''
    if debug_mode is True:
        # Attach a DEBUG-level stream handler to each library logger.
        # NOTE(review): a fresh handler is added on every construction, so
        # repeated instantiation with `debug_mode=True` duplicates log output.
        for logger_name in ("pydbm", "pysummarization"):
            logger = getLogger(logger_name)
            handler = StreamHandler()
            handler.setLevel(DEBUG)
            logger.setLevel(DEBUG)
            logger.addHandler(handler)

    # The object of NLP.
    nlp_base = NlpBase()
    if tokenizable_doc is None:
        # Set tokenizer. This is japanese tokenizer with MeCab.
        nlp_base.tokenizable_doc = MeCabTokenizer()
    else:
        nlp_base.tokenizable_doc = tokenizable_doc

    sentence_list = nlp_base.listup_sentence(document)

    # Replace each sentence by its token list (in place) while collecting
    # every token for the master list.
    all_token_list = []
    for i, sentence in enumerate(sentence_list):
        nlp_base.tokenize(sentence)
        token_list = nlp_base.token
        all_token_list.extend(token_list)
        sentence_list[i] = token_list

    token_master_list = list(set(all_token_list))

    vectorlizable_sentence = EncoderDecoder()
    vectorlizable_sentence.learn(
        sentence_list=sentence_list,
        token_master_list=token_master_list,
        hidden_neuron_count=hidden_neuron_count,
        epochs=epochs,
        batch_size=batch_size,
        learning_rate=learning_rate,
        learning_attenuate_rate=learning_attenuate_rate,
        attenuate_epoch=attenuate_epoch,
        bptt_tau=bptt_tau,
        weight_limit=weight_limit,
        dropout_rate=dropout_rate,
        test_size_rate=test_size_rate
    )

    self.__vectorlizable_sentence = vectorlizable_sentence
    self.__token_master_list = token_master_list
def learn(
    self,
    document,
    tokenizable_doc=None,
    hidden_neuron_count=1000,
    training_count=1,
    batch_size=10,
    learning_rate=1e-03,
    seq_len=5,
    cluster_num=10,
    max_iter=100
):
    '''
    Learning.

    Tokenize `document` sentence by sentence, train an `LSTMRTRBM` on the
    tokens, vectorize every sentence and cluster the resulting feature
    points with `KMeans`. The trained objects, the token master list, the
    tokenized sentences, the cluster labels and `batch_size` are stored
    on the instance.

    Args:
        document:               String of document.
        tokenizable_doc:        is-a `TokenizableDoc`.
                                If `None` (the default), `MeCabTokenizer` is used.
        hidden_neuron_count:    The number of units in hidden layer.
        training_count:         The number of training.
        batch_size:             Batch size of Mini-batch.
        learning_rate:          Learning rate.
        seq_len:                The length of one sequence.
        cluster_num:            The number of clusters.
        max_iter:               Maximum number of iterations.
    '''
    # The object of NLP.
    nlp_base = NlpBase()
    if tokenizable_doc is None:
        # Set tokenizer. This is japanese tokenizer with MeCab.
        nlp_base.tokenizable_doc = MeCabTokenizer()
    else:
        nlp_base.tokenizable_doc = tokenizable_doc

    sentence_list = nlp_base.listup_sentence(document)

    # Replace each sentence by its token list (in place) while collecting
    # every token for the master list.
    all_token_list = []
    for i, sentence in enumerate(sentence_list):
        nlp_base.tokenize(sentence)
        token_list = nlp_base.token
        all_token_list.extend(token_list)
        sentence_list[i] = token_list

    token_master_list = list(set(all_token_list))

    vectorlizable_sentence = LSTMRTRBM()
    vectorlizable_sentence.learn(
        sentence_list=sentence_list,
        token_master_list=token_master_list,
        hidden_neuron_count=hidden_neuron_count,
        training_count=training_count,
        batch_size=batch_size,
        learning_rate=learning_rate,
        seq_len=seq_len
    )

    feature_arr = vectorlizable_sentence.vectorize(sentence_list)

    # The initial noise has the same shape as the feature points.
    self.__clusterable_doc = KMeans(
        cluster_num=cluster_num,
        max_iter=max_iter,
        init_noise_arr=np.random.normal(size=feature_arr.shape)
    )
    self.__labeled_arr = self.__clusterable_doc.learn(feature_arr)

    self.__vectorlizable_sentence = vectorlizable_sentence
    self.__token_master_list = token_master_list
    self.__sentence_list = sentence_list
    self.__batch_size = batch_size
def Main(url, similarity_mode="TfIdfCosine", similarity_limit=0.75):
    '''
    Entry Point.

    Scrape the PDF at `url`, summarize the extracted document and print
    at most the first three summarized sentences.

    Args:
        url:                PDF url.
        similarity_mode:    Name of the `Similarity Filter` to use.
                            One of `EncoderDecoderCosine`, `EncoderDecoderClustering`,
                            `LSTMRTRBMCosine`, `LSTMRTRBMClustering`, `TfIdfCosine`,
                            `Dice`, `Jaccard` or `Simpson`.
        similarity_limit:   If the similarity exceeds this value, the sentence will be cut off.

    Raises:
        ValueError:         `similarity_mode` is not one of the names listed above.
    '''
    # The object of Web-scraping.
    web_scrape = WebScraping()
    # Set the object of reading PDF files.
    web_scrape.readable_web_pdf = WebPDFReading()
    # Execute Web-scraping.
    document = web_scrape.scrape(url)

    if similarity_mode == "EncoderDecoderCosine":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is so-called cosine similarity of manifolds,
        # which is embedded in hidden layer of Encoder/Decoder based on LSTM.
        similarity_filter = EncoderDecoderCosine(
            document,
            hidden_neuron_count=200,
            epochs=100,
            batch_size=100,
            learning_rate=1e-05,
            learning_attenuate_rate=0.1,
            attenuate_epoch=50,
            bptt_tau=8,
            weight_limit=0.5,
            dropout_rate=0.5,
            test_size_rate=0.3,
            debug_mode=True
        )
    elif similarity_mode == "EncoderDecoderClustering":
        # The object of `Similarity Filter`.
        # The similarity is observed by checking whether each sentence belonging to the same cluster,
        # and if so, the similarity is `1.0`, if not, the value is `0.0`.
        # The data clustering algorithm is based on K-Means method,
        # learning data which is embedded in hidden layer of LSTM.
        similarity_filter = EncoderDecoderClustering(
            document,
            hidden_neuron_count=200,
            epochs=100,
            batch_size=100,
            learning_rate=1e-05,
            learning_attenuate_rate=0.1,
            attenuate_epoch=50,
            bptt_tau=8,
            weight_limit=0.5,
            dropout_rate=0.5,
            test_size_rate=0.3,
            cluster_num=10,
            max_iter=100,
            debug_mode=True
        )
    elif similarity_mode == "LSTMRTRBMCosine":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is so-called cosine similarity of manifolds,
        # which is embedded in hidden layer of LSTM-RTRBM.
        similarity_filter = LSTMRTRBMCosine(
            document,
            training_count=1,
            hidden_neuron_count=100,
            batch_size=100,
            learning_rate=1e-03,
            seq_len=5,
            debug_mode=True
        )
    elif similarity_mode == "LSTMRTRBMClustering":
        # The object of `Similarity Filter`.
        # The similarity is observed by checking whether each sentence belonging to the same cluster,
        # and if so, the similarity is `1.0`, if not, the value is `0.0`.
        # The data clustering algorithm is based on K-Means method,
        # learning data which is embedded in hidden layer of LSTM-RTRBM.
        similarity_filter = LSTMRTRBMClustering(
            document,
            tokenizable_doc=None,
            hidden_neuron_count=1000,
            batch_size=100,
            learning_rate=1e-03,
            seq_len=5,
            cluster_num=10,
            max_iter=100,
            debug_mode=True
        )
    elif similarity_mode == "TfIdfCosine":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is so-called cosine similarity of Tf-Idf vectors.
        similarity_filter = TfIdfCosine()
    elif similarity_mode == "Dice":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Dice coefficient.
        similarity_filter = Dice()
    elif similarity_mode == "Jaccard":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Jaccard coefficient.
        similarity_filter = Jaccard()
    elif similarity_mode == "Simpson":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Simpson coefficient.
        similarity_filter = Simpson()
    else:
        # Name the offending value so the caller can see what was rejected.
        raise ValueError(
            "Unsupported similarity_mode: {!r}".format(similarity_mode)
        )

    # The object of the NLP.
    nlp_base = NlpBase()
    # Set tokenizer. This is japanese tokenizer with MeCab.
    nlp_base.tokenizable_doc = MeCabTokenizer()
    # Set the object of NLP.
    similarity_filter.nlp_base = nlp_base
    # If the similarity exceeds this value, the sentence will be cut off.
    similarity_filter.similarity_limit = similarity_limit

    # The object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer. This is japanese tokenizer with MeCab.
    auto_abstractor.tokenizable_doc = MeCabTokenizer()
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Execute summarization.
    result_dict = auto_abstractor.summarize(document, abstractable_doc, similarity_filter)

    # Output summarized sentences (at most the first three).
    # A plain loop is used because printing is a side effect, not a value to collect.
    for sentence in result_dict["summarize_result"][:3]:
        print(sentence)