def summarizeText(self, text, lines):
    # Split the text into two halves so each half can be ranked separately.
    length = len(text)
    text_div1 = text[0:int(length / 2)]
    text_div2 = text[int(length / 2):]
    # Summarize each half down to 10 sentences with TextRank.
    textrank_div1 = TextRank(text_div1)
    textrank_div2 = TextRank(text_div2)
    textresult_div1 = textrank_div1.summarize(10)
    textresult_div2 = textrank_div2.summarize(10)
    # Combine the two partial summaries and rank them again to get the final summary.
    textresult = textresult_div1 + textresult_div2
    resultrank = TextRank(textresult)
    return resultrank.summarize(lines)
def summarize(self, text, n_sents=3):
    """Summarize a given text and return the top sentences."""
    try:
        prediction = dict()
        if text:
            if self.lang_code in self.valid_langs:
                if Utility.get_doc_length(text) > self.n_words:
                    # generate sentences and normalized sentences from the text
                    sents, norm_sents = self.p.text_preprocessing(text)
                    # generate doc-term matrix and term-doc matrix
                    dt_matrix = self.generate_doc_term_matrix(norm_sents)
                    td_matrix = self.generate_term_doc_matrix(dt_matrix)
                    if self.method == "LSA":
                        lsa = LSA(self.k, td_matrix)
                        term_topic_matrix, singular_values, topic_doc_matrix = lsa.u, lsa.s, lsa.vt
                        # remove singular values below the given threshold
                        singular_values = lsa.filter_singular_values(
                            singular_values, self.sv_threshold)
                        # get salience scores from top singular values & topic-document matrix
                        salience_scores = lsa.get_salience_scores(
                            singular_values, topic_doc_matrix)
                        # get the top sentence indices for summarization
                        top_sentence_indices = lsa.get_top_sent_indices(
                            salience_scores, n_sents)
                        summary = self.generate_summary(
                            sents, top_sentence_indices)
                    elif self.method == "TEXT_RANK":
                        tr = TextRank(dt_matrix, td_matrix)
                        # build the similarity graph
                        similarity_matrix = tr.similiarity_matrix
                        similarity_graph = tr.get_similarity_graph(
                            similarity_matrix)
                        # compute PageRank scores for all sentences
                        ranked_sents = tr.rank_sentences(similarity_graph)
                        # get the top sentence indices for summarization
                        top_sentence_indices = tr.get_top_sentence_indices(
                            ranked_sents, n_sents)
                        summary = self.generate_summary(
                            sents, top_sentence_indices)
                    else:
                        return "no method found"
                    # apply cleaning for readability
                    summary = Utility.remove_multiple_whitespaces(summary)
                    summary = Utility.remove_trailing_whitespaces(summary)
                    prediction["summary"] = summary
                    prediction["message"] = "successful"
                else:
                    return "requires at least {} words".format(self.n_words)
            else:
                return "language not supported"
        else:
            return "textual content required"
        return prediction
    except Exception:
        logging.error("exception occurred", exc_info=True)
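# --- A minimal, self-contained sketch of the TEXT_RANK branch above. ---
# Assumptions: scikit-learn and networkx are installed; the function name
# text_rank_sketch and its parameters are illustrative and not part of the
# original module. It builds a tf-idf doc-term matrix, derives a sentence
# similarity graph, and ranks sentences with PageRank.
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer

def text_rank_sketch(sentences, n_sents=3):
    # doc-term matrix with l2-normalized rows, so the dot product is cosine similarity
    dt_matrix = TfidfVectorizer().fit_transform(sentences)
    similarity_matrix = (dt_matrix @ dt_matrix.T).toarray()
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)
    # keep the n_sents highest-scoring sentences, restored to document order
    top = sorted(sorted(scores, key=scores.get, reverse=True)[:n_sents])
    return " ".join(sentences[i] for i in top)

# Example:
# print(text_rank_sketch(["Cats sleep a lot.", "Dogs bark loudly.",
#                         "Cats and dogs are pets.", "The sun is a star."], 2))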
def summarizeTextList(self, textList, lines):
    # Split the sentence list into two halves so each half can be ranked separately.
    length = len(textList)
    textList1 = textList[0:int(length / 2)]
    textList2 = textList[int(length / 2):]
    # Join each half back into a plain-text block.
    text1 = ''
    text2 = ''
    for sentence in textList1:
        text1 += sentence + ' '
    for sentence in textList2:
        text2 += sentence + ' '
    # Summarize each half down to 10 sentences with TextRank.
    textrank1 = TextRank(text1)
    textrank2 = TextRank(text2)
    textresult1 = textrank1.summarize(10)
    textresult2 = textrank2.summarize(10)
    # Combine the two partial summaries and rank them again to get the final summary.
    textresult = textresult1 + textresult2
    resultrank = TextRank(textresult)
    return resultrank.summarize(lines)
def select_algorithm(algo, text, num):
    if algo == 'Wordfreq':
        obj = WordFrequency(text, num)
    elif algo == 'TextRank':
        obj = TextRank(text, num)
    elif algo == 'TF_IDF':
        obj = TF_IDF(text, num)
    else:
        raise ValueError("unknown algorithm: {}".format(algo))
    return obj.summarize_text()


#def download_file():
#    filename = 'summary.pdf'
#    file_dir = 'files'
#    return send_from_directory(file_dir, filename=filename)
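# --- A minimal, self-contained sketch of the 'Wordfreq' branch above. ---
# Assumptions: the WordFrequency class comes from the original project but its
# internals are not shown; word_frequency_sketch below is an illustrative
# stand-in that scores each sentence by the summed frequency of its words and
# returns the num highest-scoring sentences in document order.
import re
from collections import Counter

def word_frequency_sketch(text, num=3):
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    freqs = Counter(re.findall(r'\w+', text.lower()))
    scores = [sum(freqs[w] for w in re.findall(r'\w+', s.lower())) for s in sentences]
    top = sorted(sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)[:num])
    return ' '.join(sentences[i] for i in top)

# Example:
# print(word_frequency_sketch("Rain fell all day. The match was cancelled. "
#                             "Rain is common here in spring.", 2))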
def top_topics_from_lm():
    print("top_topics_from_lm")
    LM = load_pickled_data("LM_classifier_10.pickle")
    voca = load_voca()
    corpus_info = CorpusInfo()
    articles = load_articles()
    ids, docs = zip(*articles)

    print("Predicting")
    labels = LM.predict_parallel(docs)
    cont_ids, _ = zip(*filter(lambda x: x[1], zip(ids, labels)))
    cont_ids = set(cont_ids)
    print("{:.1%} are controversial".format(len(cont_ids) / len(docs)))
    save_pickle_data(cont_ids, "cont_ids")

    print("Initialize TextRank")
    text_rank = TextRank(docs, voca)
    scorer = LM.token_odd
    payloads = []
    for id, doc in articles:
        if id in cont_ids:
            param = (doc, scorer, corpus_info, text_rank, 4)
            payloads.append(param)

    n_thread = 30
    p = Pool(n_thread)
    g_phrase_score = Counter()
    print("Mapping")
    for phrase_score in p.map(top_phrase_by_scorer, payloads):
        for phrase, score in phrase_score:
            g_phrase_score[phrase] += score

    textrize = get_textrizer_plain(voca)
    result = []
    for phrase, score in g_phrase_score.most_common(300):
        plain_phrase = textrize(str2arr(phrase))
        print("{}\t{}".format(plain_phrase, score))
        result.append((plain_phrase, score))
    save_pickle_data(result, "cont_topics_lm.pickle")
text = re.sub("\u3000", "", text)
text_list.append(text)

sample_text = text_list[1]

# set up the TextRank parameters
allowPOS = ["n"]
stopwords = ["为了"]
span = 3

# This part implements the TextRank algorithm
## initialization
tr_keyword = TextRank(allowPOS, stopwords, span)
## run TextRank to extract the top 10 keywords
tr_keyword.text_rank(sample_text, 10)

# Text relationship study
## cut the text into a list of words
word_pair = tr_keyword._cut(sample_text)
## create the co-occurrence matrix
co_graph = tr_keyword.co_occurance_matrix(word_pair)
df = tr_keyword.co_occur_graph_to_matrix(co_graph, normalization=True)
## visualize the text graph
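# --- A minimal, self-contained sketch of the co-occurrence step above. ---
# Assumptions: the helper name co_occurrence_sketch and the dict-of-Counters
# representation are illustrative; the original co_occurance_matrix method is
# not shown here. Words that fall inside the same sliding window of size
# `span` are counted as co-occurring, mirroring the span parameter set above.
from collections import Counter, defaultdict

def co_occurrence_sketch(words, span=3):
    graph = defaultdict(Counter)
    for i, w in enumerate(words):
        # look ahead within the window and count the pair in both directions
        for j in range(i + 1, min(i + span, len(words))):
            graph[w][words[j]] += 1
            graph[words[j]][w] += 1
    return graph

# Example:
# print(co_occurrence_sketch(["text", "rank", "builds", "a", "graph", "of", "words"], 3))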
def setUp(self):
    self.text_rank = TextRank()
import os
import logging

from flask import Flask
from flask_migrate import Migrate
from sqlalchemy import create_engine

from settings import Settings
from models import database, Feedback, Account
from account_service import AccountService
from feedback_service import FeedbackService
from text_rank import TextRank

debug = os.environ.get("DEBUG", "false").lower() == "true"
ENV = os.environ.get("ENV", "dev")
DD_API_URL = "https://api.datadoghq.com/api/v1/"

log = logging.getLogger("summarizer_server")

app = Flask(__name__)

textrank = TextRank()
textrank.setup()

# db setup
settings = Settings()
app.config.from_object(settings)
database.init_app(app)
engine = create_engine(settings.SQLALCHEMY_DATABASE_URI)
database.metadata.create_all(engine)

accountservice = AccountService()
feedbackservice = FeedbackService()


@app.route("/v1/")
def load_text_rank(self):
    tr = TextRank()
    tr.generate_ranks()
    self.node_weights = tr.node_weights
def test_text_rank(sentences):
    ranker = TextRank(sentences)
    return ranker.rank()
def text_rank_extract_abstract(sentences, k=5):
    # Rank all sentences and keep the k highest-scoring ones.
    ranker = TextRank(sentences)
    rank_list = ranker.rank()[:k]
    # Restore original document order before joining into the abstract.
    rank_list = sorted(rank_list, key=lambda item: item['index'])
    return '\n'.join([''.join(item['sentence']) for item in rank_list])