def __call__(self, document, sentences_count, user_dict):
    self._ensure_dependecies_installed()
    self.nlp_doc = self.nlp(document)
    self.user_dict = user_dict
    logger.info("Created doc")
    dictionary = self._create_dictionary()
    # empty document
    if not dictionary:
        return ()
    matrix = self._create_matrix(dictionary)
    matrix = self._compute_term_frequency(matrix)
    u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
    ranks = iter(self._compute_ranks(sigma, v))
    sents = [s.text for s in self.nlp_doc.sents]
    logger.info("Sentences generated by spacy are %s, count %s", sents, len(sents))
    # Oversample: take twice as many candidates as requested, then filter.
    new_sents = self._get_best_sentences(sents, sentences_count * 2, lambda s: next(ranks))
    filt_sents = [sent for sent in new_sents if self.better_question(sent)]
    additional_sents = set(new_sents) - set(filt_sents)
    to_add = sentences_count - len(filt_sents)
    final_sents = filt_sents
    if to_add > 0:
        # Top up with the longest of the rejected candidates (sort before slicing).
        final_sents += sorted(additional_sents, key=len, reverse=True)[:to_add]
    logger.info("Filtered sentences %s", filt_sents)
    logger.info("Final recommendations are %s", final_sents[:sentences_count])
    return final_sents
def lsa_summary(text, limit=1):
    [dictionary, proc_text, sentences] = save_word_dict(text)
    tf_matrix = create_tf_matrix(proc_text, dictionary)
    tf_matrix = normalize_tf_matrix(tf_matrix, 0.3)

    # decompose into U x S x V matrices using SVD
    [u, s, v] = singular_value_decomposition(tf_matrix, full_matrices=False)

    reduction_ratio = 1.0
    dimension = len(s)
    reduced_dimension = int(dimension * reduction_ratio)
    min_dimension = 5
    if reduced_dimension < min_dimension:
        reduced_dimension = min_dimension

    # square the singular values and zero out those beyond the kept dimensions
    s2 = numpy.square(numpy.array(s, copy=True))
    for i in range(reduced_dimension, dimension):
        s2[i] = 0.0

    # http://textmining.zcu.cz/publications/PhDThesis-Steinberger.pdf
    # see page 25 - Sk = sqrt(sum(v * sigma^2))
    ranks = numpy.sqrt(numpy.square(v.T * s2).sum(axis=1))
    ranked_sentences = sorted(range(len(ranks)), key=lambda x: ranks[x], reverse=True)

    result_summary = ''
    for i in range(0, limit):
        result_summary = result_summary + ' ' + sentences[ranked_sentences[i]]
    return result_summary
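# A minimal usage sketch for lsa_summary above. It assumes the helper functions
# save_word_dict, create_tf_matrix and normalize_tf_matrix are importable from the
# same module; the sample text is illustrative only, not from the original code.
if __name__ == "__main__":
    sample = (
        "The cat sat on the mat. The dog slept in the sun. "
        "A summary keeps only the most informative sentences."
    )
    print(lsa_summary(sample, limit=2))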
def __call__(self, document, sentences_count):
    dictionary = self.create_dictionary(document)
    matrix = self.create_matrix(document, dictionary)
    matrix = self.compute_term_frequency(matrix)
    # print("mat==============================================:\n", matrix)
    u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
    ranks = iter(self.compute_ranks(sigma, v))
    # print("ranks**************************************\n", next(ranks))
    return self.get_best_sentences(document.sentences, sentences_count, lambda s: next(ranks))
def __call__(self, doc, sent_count):
    dictionary = self._create_dictionary(doc)
    matrix = self._create_matrix(doc, dictionary)
    matrix = self._compute_term_frequency(matrix)
    u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
    ranks = iter(self._compute_ranks(sigma, v))
    return self._get_best_sentences(doc.sentences, sent_count, lambda s: next(ranks))
def svdBpmMatrix(self, plane):
    # Computes SVD of the bpm_matrix
    global nturns
    bpm_matrix = self.peak_peak(plane)
    n_bpms = shape(bpm_matrix)[1]
    print('performing SVD')
    # ----svd for matrix with bpms > 10
    if n_bpms > 10:
        A = singular_value_decomposition(bpm_matrix, full_matrices=0)
    else:
        sys.exit('Exit, # of bpms < 10')
    return A
def svdClean(self, plane):
    global nturns, tx, ty
    print('removing noise floor', plane)
    if plane == 'x':
        b = tx[turn:, :]  # truncate by the first 5 turns
        n_turns = shape(b)[0]
    elif plane == 'y':
        b = ty[turn:, :]  # truncate by the first 5 turns
        n_turns = shape(b)[0]
    else:
        print("no tbt data acquired")
    b_mean = mean(b)
    b = (b - b_mean) / sqrt(n_turns)
    n_bpms = shape(b)[1]
    # ----svd for matrix with bpms > 10
    if n_bpms > 10:
        A = singular_value_decomposition(b, full_matrices=0)
        # print("Singular values:", A[1])
    else:
        sys.exit('Exit, # of bpms < 10')
    # ----SVD cut for noise floor
    if sing_val > n_bpms:
        svdcut = n_bpms
        print('requested more singular values than available')
        print('# of sing_val used for', plane, '=', n_bpms)
    else:
        svdcut = int(sing_val)
        print('# of sing_val used for', plane, '=', svdcut)
    # print(A[1][0])
    A[1][svdcut:] = 0.  # zero the smallest singular values (the noise floor)
    # temp = matrixmultiply(identity(len(A[1])) * A[1], A[2])
    temp = matrixmultiply(diag(A[1]), A[2])
    b = matrixmultiply(A[0], temp)  ### check
    # undo the normalisation applied before the SVD
    b = (b * sqrt(n_turns)) + b_mean
    # b = b * sqrt(n_turns)
    if plane == 'x':
        tx[turn:, :] = b
    elif plane == 'y':
        ty[turn:, :] = b
    else:
        print("no tbt data to analyze")
    nturns = shape(tx)[0]
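# Self-contained sketch of the noise-floor cut performed in svdClean above, written
# against plain numpy with synthetic data. The names (noisy, kept_modes, cleaned) are
# illustrative only and do not come from the original code.
import numpy as np

rng = np.random.default_rng(0)
signal = np.outer(np.sin(np.linspace(0, 6.28, 200)), np.ones(20))  # rank-1 "beam" signal
noisy = signal + 0.05 * rng.standard_normal(signal.shape)

U, s, Vt = np.linalg.svd(noisy, full_matrices=False)
kept_modes = 3                 # analogous to sing_val / svdcut above
s[kept_modes:] = 0.0           # zero the small singular values (the noise floor)
cleaned = U @ np.diag(s) @ Vt  # reconstruct the de-noised matrix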
def __call__(self, document, sentences_count):
    self._ensure_dependecies_installed()
    dictionary = self._create_dictionary(document)
    if not dictionary:
        return ()
    matrix = self._create_matrix(document, dictionary)
    matrix = self._compute_term_frequency(matrix)
    # print(matrix)
    u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
    ranks = iter(self._compute_ranks(sigma, v))
    return self._get_best_sentences(document.sentences, sentences_count, lambda s: next(ranks))
def __call__(self, document, sentences_count):
    self._ensure_dependecies_installed()
    dictionary = self._create_dictionary(document)
    # empty document
    if not dictionary:
        return ()
    matrix = self._create_matrix(document, dictionary)
    matrix = self._compute_term_frequency(matrix)
    u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
    ranks = iter(self._compute_ranks(sigma, v))
    return self._get_best_sentences(document.sentences, sentences_count, lambda s: next(ranks))
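# Standalone sketch of the sentence ranking behind the _compute_ranks helpers used in
# the snippets above, following Steinberger's formula S_k = sqrt(sum_i sigma_i^2 * v_{i,k}^2)
# (http://textmining.zcu.cz/publications/PhDThesis-Steinberger.pdf, p. 25). The function
# name compute_ranks and the kept_dimensions cut-off are illustrative assumptions, not
# the exact helper from any of the summarizers shown here.
import numpy as np

def compute_ranks(sigma, v_t, kept_dimensions=None):
    """Rank each sentence (column of V^T) by its sigma^2-weighted length across topics."""
    if kept_dimensions is None:
        kept_dimensions = len(sigma)
    powered_sigma = np.square(np.asarray(sigma, dtype=float))
    powered_sigma[kept_dimensions:] = 0.0  # drop the least significant topics
    # v_t has shape (topics, sentences); weight each topic by sigma^2, sum per sentence
    return np.sqrt((powered_sigma[:, None] * np.square(v_t)).sum(axis=0))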
def __call__(self, document, sentences_count):
    dictionary = self._create_dictionary(document)
    sentences = sent_tokenize(document)
    matrix = self._create_matrix(document, dictionary)
    matrix = self._compute_TfIdf(matrix)
    u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
    v = self._preprocess_matrix_V(v)
    ranks = iter(self._compute_ranks(v, sigma))
    return self._get_best_sentences(sentences, sentences_count, lambda s: next(ranks))
def __call__(self, document, sentences_count):
    dictionary = self._create_dictionary(document)
    if not dictionary:
        return ()
    sentences = []
    for i in range(0, len(document)):
        li = sent_tokenize(document[i])
        sentences.extend(li)
    # print(sentences)
    matrix = self._create_matrix(document, dictionary)
    matrix = self._compute_term_frequency(matrix)
    u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
    ranks = iter(self._compute_ranks(sigma, v))
    return self._get_best_sentences(sentences, sentences_count, lambda s: next(ranks))
def start_lsa(article_id, limit, text, reference_summary):
    if text is None:
        text = "Thomas A. Anderson is a man living two lives. By day he is an " + \
               "average computer programmer and by night a hacker known as " + \
               "Neo. Neo has always questioned his reality, but the truth is " + \
               "far beyond his imagination. Neo finds himself targeted by the " + \
               "police when he is contacted by Morpheus, a legendary computer " + \
               "hacker branded a terrorist by the government. Morpheus awakens " + \
               "Neo to the real world, a ravaged wasteland where most of " + \
               "humanity have been captured by a race of machines that live " + \
               "off of the humans' body heat and electrochemical energy and " + \
               "who imprison their minds within an artificial reality known as " + \
               "the Matrix. As a rebel against the machines, Neo must return to " + \
               "the Matrix and confront the agents: super-powerful computer " + \
               "programs devoted to snuffing out Neo and the entire human " + \
               "rebellion."

    [dictionary, proc_text, sentences] = save_word_dict(text)
    tf_matrix = create_tf_matrix(proc_text, dictionary)
    tf_matrix = normalize_tf_matrix(tf_matrix, 0.3)

    # decompose into U x S x V matrices using SVD
    [u, s, v] = singular_value_decomposition(tf_matrix, full_matrices=False)

    reduction_ratio = 1.0
    dimension = s.shape[0]
    reduced_dimension = int(dimension * reduction_ratio)
    min_dimension = 1
    if reduced_dimension < min_dimension:
        reduced_dimension = min_dimension

    s2 = numpy.array(s, copy=True)
    s2 = numpy.square(s2)
    if reduced_dimension < dimension:
        for i in range(reduced_dimension, dimension):
            s2[i] = 0

    # http://textmining.zcu.cz/publications/PhDThesis-Steinberger.pdf
    # see page 25 - Sk = sqrt(sum(v * sigma^2))
    ranks = numpy.sqrt(numpy.square(v.T * s2).sum(axis=1))
    # print("ranks", ranks)
    ranked_sentences = sorted(range(len(ranks)), key=lambda x: ranks[x], reverse=True)
    # print("ranked_sentences", ranked_sentences)

    result_summary = ''
    for i in range(0, limit):
        result_summary = result_summary + ' ' + sentences[ranked_sentences[i]]
    system_summary = result_summary

    # if reference_summary is not None:
    #     try:
    #         reference_summary = summarize(text)
    #     except (ValueError, ZeroDivisionError):
    #         return -1
    #     if reference_summary is None or len(reference_summary) == 0 or len(reference_summary) > 140:
    #         return -1

    # write reference summary to file
    sys_dir = os.pardir + "/test-summarization/system/" + article_id + "_" + "system.txt"
    ref_dir = os.pardir + "/test-summarization/reference/" + article_id + "_" + "reference.txt"
    write_to_file(ref_dir, reference_summary)
    reference_summary_list.append([ref_dir])

    # write system summary to file
    write_to_file(sys_dir, system_summary)
    system_summary_list.append(sys_dir)
    # test_print(reference_summary, system_summary)

    return ranked_sentences
def LSAPlus_SumPlus(doc):
    import math

    # SumPlus: rank sentences with SumBasic
    sumbasic_sents = []
    for text in doc:
        tsummarizer_w_stops = SumBasicSummarizer()
        tsummarizer_w_stops.stop_words = get_stop_words('english')
        parser = PlaintextParser.from_string(text, Tokenizer('english'))
        dictionary = tsummarizer_w_stops._compute_ratings(parser.document.sentences)
        sumbasic_sents_entries = []
        for sent in dictionary:
            sumbasic_sents_entries.append(sent)
        sumbasic_sents.append(sumbasic_sents_entries)

    # LSAPlus: rank sentences by the first LSA topic
    lsa_sents = []
    for text in doc:
        l2summarizer = LsaSummarizer()
        parser = PlaintextParser.from_string(text, Tokenizer('english'))
        dictionary = l2summarizer._create_dictionary(parser.document)
        matrix = l2summarizer._create_matrix(parser.document, dictionary)
        matrix2 = l2summarizer._compute_term_frequency(matrix)
        u, sigma, v = singular_value_decomposition(matrix2, full_matrices=False)
        v_sorted = sorted(abs(v[:, 0]), reverse=True)
        # map each sorted score back to the position of its original sentence
        v_indices = []
        for i in v_sorted:
            v_indices.append(list(abs(v[:, 0])).index(i))
        sents = np.array(list(parser.document.sentences))
        sents = sents[np.array(v_indices)]  # reorder sentences by descending score
        lsa_sents_entries = list(sents)
        lsa_sents.append(lsa_sents_entries)

    # Combining SumPlus and LSAPlus
    num_sentences = len(sumbasic_sents)
    all_sents_removed_parent2 = []
    for entry in range(num_sentences):
        num_sents_to_remove = math.ceil(len(sumbasic_sents[entry]) / 2)
        sent_len = len(sumbasic_sents[entry])
        sb = sumbasic_sents[entry][sent_len - num_sents_to_remove:sent_len]
        lsa = lsa_sents[entry][sent_len - num_sents_to_remove:sent_len]
        # Checking if sentences are ranked badly by BOTH LSAPlus and SumPlus
        sents_removed3 = []
        for sent in lsa:
            if sent in sb:
                sents_removed3.append(sent)
        # Sentences to be trimmed off
        all_sents_removed_parent2.append(sents_removed3)

    sents_to_keep_parent2 = []
    for i in range(len(doc)):
        parser = PlaintextParser.from_string(doc[i], Tokenizer('english'))
        sents = parser.document.sentences
        # Sentences not trimmed off
        sents_to_keep2 = [
            sentence for sentence in sents
            if sentence not in all_sents_removed_parent2[i]
        ]
        # Appending trimmed text for each entry
        sents_to_keep_parent2.append(sents_to_keep2)

    # Trimmed text
    sentence_parent2 = []
    for text in sents_to_keep_parent2:
        sentence = ""
        for sent in text:
            sentence = sentence + " " + str(sent)
        sentence_parent2.append(sentence)
    return sentence_parent2
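# Tiny illustration of the combination rule in LSAPlus_SumPlus above: a sentence is
# dropped only when it falls in the bottom half of BOTH rankings. The rankings and
# sentence labels below are made up for the example.
import math

lsa_ranking = ["s1", "s3", "s2", "s5", "s4"]        # best to worst by LSA
sumbasic_ranking = ["s3", "s2", "s1", "s5", "s4"]   # best to worst by SumBasic
bottom = math.ceil(len(lsa_ranking) / 2)
dropped = set(lsa_ranking[-bottom:]) & set(sumbasic_ranking[-bottom:])
kept = [s for s in lsa_ranking if s not in dropped]  # dropped == {"s4", "s5"}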
def run(self):
    self.signals.UpdateProgressBar.emit(0)

    # Load the text
    self.text = TextData(self.filename)
    self.text.original_sentences = readSentencesFromInputText(self.filename, None)
    original_sentences = tuple(self.text.original_sentences)
    self.signals.UpdateProgressBar.emit(5)

    # Split the text into words
    self.configurations["minimal_words_in_sentence"] = 4
    self.configurations['need_agresive_filtration'] = True
    self.text.tokenized_sentences = tokenizeSingleText(self.text, self.configurations)

    # Remove stop words
    self.configurations["minimal_word_size"] = 3
    self.text.no_stop_words_sentences = removeStopWordsFromSentences(
        self.text.tokenized_sentences, self.morph, self.configurations)

    if len(self.text.no_stop_words_sentences) > 0:
        np.set_printoptions(suppress=False)
        self.signals.UpdateProgressBar.emit(20)

        # Normalization
        texts, log_string = normalizeTexts([self.text], self.morph)
        self.text = texts[0]

        # Case normalization
        texts, log_string = fixRegisterInTexts(texts, self.morph)
        self.text = texts[0]
        self.signals.UpdateProgressBar.emit(30)

        # Compute the word frequency table
        texts, log_string = calculateWordsFrequencyInTexts(texts)
        self.text = texts[0]
        self.signals.UpdateProgressBar.emit(40)

        matrix, all_word_keys = self.CreateLSAMatrixForSummarization(self.text)
        matrix = self._compute_term_frequency(matrix)
        self.signals.UpdateProgressBar.emit(50)

        u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
        u = u + np.abs(np.min(u))
        v = v + np.abs(np.min(v))
        u, sigma, v = self.cutSingularValue(u, sigma, v)
        self.signals.UpdateProgressBar.emit(70)

        if self.calculation_method == AnnotationMakerCalculator.METHOD_BY_SENTENCE_VALUE:
            self.calculateBySentenceValues(v, self.result_sentence_count)
        if self.calculation_method == AnnotationMakerCalculator.METHOD_BY_WORDS_SUM:
            self.calculateByWordsValues(all_word_keys, u, self.result_sentence_count)

        self.signals.PrintInfo.emit('\nУспешно завершено.')  # "Completed successfully."
    else:
        self.signals.PrintInfo.emit('\nНедостаточно входных данных и/или много неликвидных данных.')  # "Not enough input data and/or too much unusable data."

    self.signals.UpdateProgressBar.emit(100)
    self.signals.Finished.emit()
def lsa_text_extraction(textdoc, smooth=0.4, MIN_DIMENSIONS=3, REDUCTION_RATIO=1 / 1, topn=5):
    """
    reduction_ratio: limits computation cost by capping the diagonal size;
        1 keeps the original diagonal size, 0.4 keeps only 0.4 * the original size
    smooth: factor applied during matrix normalization; a small value may cause
        overfitting and a large value may cause underfitting
    """
    ''' document to sentences '''
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    document = tokenizer.tokenize(textdoc)

    ''' generate term freq matrix '''
    assert 0.0 <= smooth < 1.0
    preprocessed_text = textClean.pipeline(document, multi_gram=[1], lower_case=True,
                                           deacc=False, encoding='utf8', errors='strict',
                                           stem_lemma='lemma', tag_drop=[], nltk_stop=True,
                                           stop_word_list=[], check_numbers=False,
                                           word_length=2, remove_consecutives=True)
    dictionary = DocVector.generate_corpus_dict(preprocessed_text, no_below=2,
                                                no_above=0.5, keep_n=100000)
    doc_vec = DocVector.create_document_vector(preprocessed_text, dictionary)
    tfmatrix = DocVector.get_vocab_matrix(doc_vec, dictionary)
    matrix_copy = tfmatrix.values.T

    '''
    Compute TF metrics for each sentence (column) of the matrix and normalize the tf
    weight of every term by the maximum tf in that document, according to
    ntf_{t,d} = a + (1 - a) * tf_{t,d} / tf_max(d).
    The smoothing term a damps the contribution of the second term, which can be seen
    as scaling tf down by the largest tf value in d.
    '''
    max_word_frequencies = np.max(matrix_copy, axis=0)
    rows, cols = matrix_copy.shape
    for row in range(rows):
        for col in range(cols):
            max_word_frequency = max_word_frequencies[col]
            if max_word_frequency != 0:
                frequency = matrix_copy[row, col] / max_word_frequency
                matrix_copy[row, col] = smooth + (1.0 - smooth) * frequency

    ''' get ranks '''
    u, sigma, v_matrix = singular_value_decomposition(matrix_copy, full_matrices=False)
    assert len(sigma) == v_matrix.shape[0]
    dimensions = max(MIN_DIMENSIONS, int(len(sigma) * REDUCTION_RATIO))
    powered_sigma = tuple(s**2 if i < dimensions else 0.0 for i, s in enumerate(sigma))
    ranks = []
    for column_vector in v_matrix.T:
        rank = sum(s * v**2 for s, v in zip(powered_sigma, column_vector))
        ranks.append(math.sqrt(rank))

    ''' output result '''
    percentile_list = pd.DataFrame({
        'sentence': document,
        'rank': ranks,
    }).sort_values(by='rank', ascending=False)
    output_sentence = [i for i in percentile_list.head(topn)['sentence']]
    return output_sentence
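# A minimal usage sketch for lsa_text_extraction above. It assumes the textClean and
# DocVector helper modules used inside the function are importable and that the nltk
# punkt tokenizer data has been downloaded; the input text is illustrative only.
article = (
    "Solar panels convert sunlight into electricity. Their efficiency has improved "
    "steadily over the last decade. Installation costs have fallen as well. "
    "Many households now recover the initial investment within a few years."
)
top_sentences = lsa_text_extraction(article, smooth=0.4, topn=2)
print(top_sentences)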
def computeSVD(self, B):
    return singular_value_decomposition(B, full_matrices=0)
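# Illustrative check of what the returned (U, s, Vt) triple contains; in snippets like
# these, singular_value_decomposition is typically an alias for numpy.linalg.svd, which
# is what this sketch uses directly.
import numpy as np

B = np.array([[3.0, 1.0], [1.0, 3.0], [0.0, 2.0]])
U, s, Vt = np.linalg.svd(B, full_matrices=False)  # U: (3, 2), s: (2,), Vt: (2, 2)
assert np.allclose(U @ np.diag(s) @ Vt, B)        # the factors reconstruct B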