def Main(url):
    '''
    Entry Point.

    Args:
        url:    Target URL.
    '''
    # The object of Web-Scraping.
    web_scrape = WebScraping()
    # Execute Web-Scraping.
    document = web_scrape.scrape(url)

    # The object of NLP.
    nlp_base = NlpBase()
    # Set tokenizer. This is a Japanese tokenizer based on MeCab.
    nlp_base.tokenizable_doc = MeCabTokenizer()

    sentence_list = nlp_base.listup_sentence(document)

    batch_size = 10
    if len(sentence_list) < batch_size:
        raise ValueError("The number of extracted sentences is insufficient.")

    all_token_list = []
    for i in range(len(sentence_list)):
        nlp_base.tokenize(sentence_list[i])
        all_token_list.extend(nlp_base.token)
        sentence_list[i] = nlp_base.token

    # Vectorize the tokenized sentences with the LSTM-RTRBM model.
    vectorlizable_sentence = LSTMRTRBM()
    vectorlizable_sentence.learn(
        sentence_list=sentence_list,
        token_master_list=list(set(all_token_list)),
        hidden_neuron_count=1000,
        batch_size=batch_size,
        learning_rate=1e-03,
        seq_len=5
    )
    test_list = sentence_list[:batch_size]
    feature_points_arr = vectorlizable_sentence.vectorize(test_list)

    print("Feature points (Top {} sentences):".format(batch_size))
    print(feature_points_arr)
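A minimal runner for the demo above, added for illustration. The import paths are assumptions inferred from the pysummarization package layout and should be verified against the installed version; in a real script they belong at the top of the file.

# Assumed import paths; verify against the installed pysummarization version.
from pysummarization.nlp_base import NlpBase
from pysummarization.web_scraping import WebScraping
from pysummarization.tokenizabledoc.mecab_tokenizer import MeCabTokenizer
from pysummarization.vectorizablesentence.lstm_rtrbm import LSTMRTRBM
# The EncoderDecoder demo below would instead import (path also assumed):
# from pysummarization.vectorizablesentence.encoder_decoder import EncoderDecoder
import sys

if __name__ == "__main__":
    # Pass the target URL on the command line, e.g. a Japanese article,
    # since the default tokenizer is MeCab-based.
    Main(sys.argv[1])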
# Variant of the demo above that vectorizes sentences with the LSTM-based
# Encoder/Decoder model instead of the LSTM-RTRBM.
def Main(url):
    '''
    Entry Point.

    Args:
        url:    Target URL.
    '''
    # The object of Web-Scraping.
    web_scrape = WebScraping()
    # Execute Web-Scraping.
    document = web_scrape.scrape(url)

    # The object of NLP.
    nlp_base = NlpBase()
    # Set tokenizer. This is a Japanese tokenizer based on MeCab.
    nlp_base.tokenizable_doc = MeCabTokenizer()

    sentence_list = nlp_base.listup_sentence(document)

    all_token_list = []
    for i in range(len(sentence_list)):
        nlp_base.tokenize(sentence_list[i])
        all_token_list.extend(nlp_base.token)
        sentence_list[i] = nlp_base.token

    vectorlizable_sentence = EncoderDecoder()
    vectorlizable_sentence.learn(
        sentence_list=sentence_list,
        token_master_list=list(set(all_token_list)),
        epochs=60
    )
    test_list = sentence_list[:5]
    feature_points_arr = vectorlizable_sentence.vectorize(test_list)
    reconstruction_error_arr = vectorlizable_sentence.controller.get_reconstruction_error().mean()

    print("Feature points (Top 5 sentences):")
    print(feature_points_arr)
    print("Reconstruction error (MSE):")
    print(reconstruction_error_arr)
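The feature points returned by `vectorize` are ordinary NumPy data, so downstream similarity filtering can be done with plain NumPy. The helper below is not part of the library; it is a sketch that assumes `feature_points_arr` is a 2-D array of shape (n_sentences, dim). If the model returns per-time-step hidden states, pool or flatten them first.

import numpy as np

def cosine_similarity_matrix(feature_points_arr):
    '''
    Compute pairwise cosine similarity between sentence feature vectors.

    Args:
        feature_points_arr:    `np.ndarray` of shape (n_sentences, dim),
                               e.g. the output of `vectorlizable_sentence.vectorize(test_list)`.

    Returns:
        `np.ndarray` of shape (n_sentences, n_sentences).
    '''
    # Normalize each row to unit length, guarding against zero vectors.
    norm_arr = np.linalg.norm(feature_points_arr, axis=1, keepdims=True)
    norm_arr[norm_arr == 0] = 1.0
    unit_arr = feature_points_arr / norm_arr
    return unit_arr.dot(unit_arr.T)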
def learn(
    self,
    document,
    tokenizable_doc=None,
    hidden_neuron_count=200,
    epochs=100,
    batch_size=100,
    learning_rate=1e-05,
    learning_attenuate_rate=0.1,
    attenuate_epoch=50,
    bptt_tau=8,
    weight_limit=0.5,
    dropout_rate=0.5,
    test_size_rate=0.3,
    cluster_num=10,
    max_iter=100
):
    '''
    Learning.

    Args:
        document:                   String of document.
        tokenizable_doc:            is-a `TokenizableDoc`, or `None` to fall back to the MeCab tokenizer.
        hidden_neuron_count:        The number of units in the hidden layer.
        epochs:                     Epochs of mini-batch training.
        batch_size:                 Batch size of mini-batch training.
        learning_rate:              Learning rate.
        learning_attenuate_rate:    Attenuate the `learning_rate` by a factor of this value every `attenuate_epoch`.
        attenuate_epoch:            Attenuate the `learning_rate` by a factor of `learning_attenuate_rate` every `attenuate_epoch`.
                                    Additionally, in relation to regularization, this class constrains the weight matrices every `attenuate_epoch`.
        bptt_tau:                   Maximum step `t` referred to in Backpropagation Through Time (BPTT).
        weight_limit:               Regularization for the weight matrix: repeatedly multiply the weight matrix by `0.9`
                                    until $\sum_{j=0}^{n}w_{ji}^2 < weight\_limit$.
        dropout_rate:               The probability of dropout.
        test_size_rate:             Size of the test data set. If this value is `0`, validation is not executed.
        cluster_num:                The number of clusters.
        max_iter:                   Maximum number of iterations.
    '''
    # The object of NLP.
    nlp_base = NlpBase()
    if tokenizable_doc is None:
        # Set tokenizer. This is a Japanese tokenizer based on MeCab.
        nlp_base.tokenizable_doc = MeCabTokenizer()
    else:
        nlp_base.tokenizable_doc = tokenizable_doc

    sentence_list = nlp_base.listup_sentence(document)

    all_token_list = []
    for i in range(len(sentence_list)):
        nlp_base.tokenize(sentence_list[i])
        all_token_list.extend(nlp_base.token)
        sentence_list[i] = nlp_base.token

    token_master_list = list(set(all_token_list))

    vectorlizable_sentence = EncoderDecoder()
    vectorlizable_sentence.learn(
        sentence_list=sentence_list,
        token_master_list=token_master_list,
        hidden_neuron_count=hidden_neuron_count,
        epochs=epochs,
        batch_size=batch_size,
        learning_rate=learning_rate,
        learning_attenuate_rate=learning_attenuate_rate,
        attenuate_epoch=attenuate_epoch,
        bptt_tau=bptt_tau,
        weight_limit=weight_limit,
        dropout_rate=dropout_rate,
        test_size_rate=test_size_rate
    )
    self.__vectorlizable_sentence = vectorlizable_sentence
    self.__token_master_list = token_master_list

    feature_arr = vectorlizable_sentence.vectorize(sentence_list)

    # Cluster the sentence vectors with K-Means (`np` refers to NumPy).
    self.__clusterable_doc = KMeans(
        cluster_num=cluster_num,
        max_iter=max_iter,
        init_noise_arr=np.random.normal(size=feature_arr.shape)
    )
    self.__labeled_arr = self.__clusterable_doc.learn(feature_arr)

    self.__sentence_list = sentence_list
    self.__batch_size = batch_size
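The `weight_limit` constraint described in the docstring amounts to the following scaling rule. This is a standalone NumPy sketch of that rule, not the library's internal implementation; the function name and the column-wise interpretation of $\sum_{j=0}^{n}w_{ji}^2$ are assumptions.

import numpy as np

def constrain_weights(weights_arr, weight_limit=0.5):
    '''
    For each unit i, repeatedly scale its incoming weights by 0.9
    until sum_j w_ji ** 2 < weight_limit.
    '''
    weights_arr = weights_arr.copy()
    for i in range(weights_arr.shape[1]):
        while np.sum(np.square(weights_arr[:, i])) >= weight_limit:
            weights_arr[:, i] = weights_arr[:, i] * 0.9
    return weights_arr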
def __init__(
    self,
    document,
    tokenizable_doc=None,
    hidden_neuron_count=200,
    epochs=100,
    batch_size=100,
    learning_rate=1e-05,
    learning_attenuate_rate=0.1,
    attenuate_epoch=50,
    bptt_tau=8,
    weight_limit=0.5,
    dropout_rate=0.5,
    test_size_rate=0.3,
    debug_mode=False
):
    '''
    Init.

    Args:
        document:                   String of document.
        tokenizable_doc:            is-a `TokenizableDoc`, or `None` to fall back to the MeCab tokenizer.
        hidden_neuron_count:        The number of units in the hidden layer.
        epochs:                     Epochs of mini-batch training.
        batch_size:                 Batch size of mini-batch training.
        learning_rate:              Learning rate.
        learning_attenuate_rate:    Attenuate the `learning_rate` by a factor of this value every `attenuate_epoch`.
        attenuate_epoch:            Attenuate the `learning_rate` by a factor of `learning_attenuate_rate` every `attenuate_epoch`.
                                    Additionally, in relation to regularization, this class constrains the weight matrices every `attenuate_epoch`.
        bptt_tau:                   Maximum step `t` referred to in Backpropagation Through Time (BPTT).
        weight_limit:               Regularization for the weight matrix: repeatedly multiply the weight matrix by `0.9`
                                    until $\sum_{j=0}^{n}w_{ji}^2 < weight\_limit$.
        dropout_rate:               The probability of dropout.
        test_size_rate:             Size of the test data set. If this value is `0`, validation is not executed.
        debug_mode:                 Debug mode or not.
    '''
    if debug_mode is True:
        # Emit DEBUG-level logs from both libraries to stderr.
        # `getLogger`, `StreamHandler`, and `DEBUG` come from the standard `logging` module.
        logger = getLogger("pydbm")
        handler = StreamHandler()
        handler.setLevel(DEBUG)
        logger.setLevel(DEBUG)
        logger.addHandler(handler)

        logger = getLogger("pysummarization")
        handler = StreamHandler()
        handler.setLevel(DEBUG)
        logger.setLevel(DEBUG)
        logger.addHandler(handler)

    # The object of NLP.
    nlp_base = NlpBase()
    if tokenizable_doc is None:
        # Set tokenizer. This is a Japanese tokenizer based on MeCab.
        nlp_base.tokenizable_doc = MeCabTokenizer()
    else:
        nlp_base.tokenizable_doc = tokenizable_doc

    sentence_list = nlp_base.listup_sentence(document)

    all_token_list = []
    for i in range(len(sentence_list)):
        nlp_base.tokenize(sentence_list[i])
        all_token_list.extend(nlp_base.token)
        sentence_list[i] = nlp_base.token

    token_master_list = list(set(all_token_list))

    vectorlizable_sentence = EncoderDecoder()
    vectorlizable_sentence.learn(
        sentence_list=sentence_list,
        token_master_list=token_master_list,
        hidden_neuron_count=hidden_neuron_count,
        epochs=epochs,
        batch_size=batch_size,
        learning_rate=learning_rate,
        learning_attenuate_rate=learning_attenuate_rate,
        attenuate_epoch=attenuate_epoch,
        bptt_tau=bptt_tau,
        weight_limit=weight_limit,
        dropout_rate=dropout_rate,
        test_size_rate=test_size_rate
    )
    self.__vectorlizable_sentence = vectorlizable_sentence
    self.__token_master_list = token_master_list
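When `debug_mode` is left at `False`, the same DEBUG output can be enabled from the calling application. The sketch below uses only the standard `logging` module and the two logger names that appear in the constructor above; the helper name and log format are illustrative.

from logging import getLogger, StreamHandler, Formatter, DEBUG

def enable_debug_logging():
    '''Attach a DEBUG-level stream handler to the pydbm and pysummarization loggers.'''
    handler = StreamHandler()
    handler.setLevel(DEBUG)
    handler.setFormatter(Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s"))
    for name in ("pydbm", "pysummarization"):
        logger = getLogger(name)
        logger.setLevel(DEBUG)
        logger.addHandler(handler)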
def learn(
    self,
    document,
    tokenizable_doc,
    hidden_neuron_count=1000,
    training_count=1,
    batch_size=10,
    learning_rate=1e-03,
    seq_len=5,
    cluster_num=10,
    max_iter=100
):
    '''
    Learning.

    Args:
        document:               String of document.
        tokenizable_doc:        is-a `TokenizableDoc`, or `None` to fall back to the MeCab tokenizer.
        hidden_neuron_count:    The number of units in the hidden layer.
        training_count:         The number of training iterations.
        batch_size:             Batch size of mini-batch training.
        learning_rate:          Learning rate.
        seq_len:                The length of one sequence.
        cluster_num:            The number of clusters.
        max_iter:               Maximum number of iterations.
    '''
    # The object of NLP.
    nlp_base = NlpBase()
    if tokenizable_doc is None:
        # Set tokenizer. This is a Japanese tokenizer based on MeCab.
        nlp_base.tokenizable_doc = MeCabTokenizer()
    else:
        nlp_base.tokenizable_doc = tokenizable_doc

    sentence_list = nlp_base.listup_sentence(document)

    all_token_list = []
    for i in range(len(sentence_list)):
        nlp_base.tokenize(sentence_list[i])
        all_token_list.extend(nlp_base.token)
        sentence_list[i] = nlp_base.token

    token_master_list = list(set(all_token_list))

    vectorlizable_sentence = LSTMRTRBM()
    vectorlizable_sentence.learn(
        sentence_list=sentence_list,
        token_master_list=token_master_list,
        hidden_neuron_count=hidden_neuron_count,
        training_count=training_count,
        batch_size=batch_size,
        learning_rate=learning_rate,
        seq_len=seq_len
    )
    feature_arr = vectorlizable_sentence.vectorize(sentence_list)

    # Cluster the sentence vectors with K-Means (`np` refers to NumPy).
    self.__clusterable_doc = KMeans(
        cluster_num=cluster_num,
        max_iter=max_iter,
        init_noise_arr=np.random.normal(size=feature_arr.shape)
    )
    self.__labeled_arr = self.__clusterable_doc.learn(feature_arr)

    self.__vectorlizable_sentence = vectorlizable_sentence
    self.__token_master_list = token_master_list
    self.__sentence_list = sentence_list
    self.__batch_size = batch_size
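Unlike the Encoder/Decoder variant, `tokenizable_doc` has no default in this signature, so the caller passes a tokenizer (or `None`) explicitly. A hypothetical call, where `rtrbm_controller` stands in for an instance of the enclosing class (which is not named in this excerpt) and `document` is any raw text, e.g. from `WebScraping().scrape(url)`:

rtrbm_controller.learn(
    document=document,
    tokenizable_doc=MeCabTokenizer(),   # or None to fall back to the same MeCab default.
    hidden_neuron_count=1000,
    training_count=1,
    batch_size=10,
    learning_rate=1e-03,
    seq_len=5,
    cluster_num=10,
    max_iter=100
)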