def get_info(path):
    """Print a 10-sentence summary and the stemmed keywords of a PDF.

    Reads the page count with PdfFileReader, extracts the full text with
    convert_pdf_to_txt, summarizes it with sumy's Summarizer, then prints
    the set of stemmed keywords extracted from the text.

    :param path: filesystem path to the input PDF file.
    :returns: None; all output goes to stdout.
    """
    # The page count is needed so convert_pdf_to_txt covers the whole document.
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        number_of_pages = pdf.getNumPages()
    text = convert_pdf_to_txt(path, pages=range(number_of_pages))

    LANGUAGE = "english"
    SENTENCES_COUNT = 10
    # For HTML input one could instead use:
    #   parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # PlaintextParser.from_file is available for files.
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    print("SUMMARY")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)

    keywordlist1 = keywords.keywords(text, split=True)  # list of keywords
    keywordstring = " ".join(keywordlist1)  # single string for tokenization
    tokens = nltk.word_tokenize(keywordstring)
    print("\n", "KEYWORDS")
    # Stemming collapses inflected variants; the set keeps keywords unique.
    print("\n", {stem(t) for t in tokens}, "\n")
def get_info(path):
    """Print a 10-sentence summary and the stemmed keywords of a text file.

    Plain-text variant of get_info (NOTE: this redefinition shadows the PDF
    variant defined earlier in this module). Reads the file, summarizes it
    with sumy's Summarizer, then prints the set of stemmed keywords.

    :param path: filesystem path to the input text file.
    :returns: None; all output goes to stdout.
    """
    # Explicit utf8 for portability, consistent with the writers in this file.
    with open(path, 'r', encoding='utf8') as f:
        text = f.read()

    LANGUAGE = "english"
    SENTENCES_COUNT = 10
    # For HTML input one could instead use:
    #   parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # PlaintextParser.from_file is available for files.
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    print("SUMMARY")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)

    keywordlist1 = keywords.keywords(text, split=True)  # list of keywords
    keywordstring = " ".join(keywordlist1)  # single string for tokenization
    tokens = nltk.word_tokenize(keywordstring)
    print("\n", "KEYWORDS")
    # Stemming collapses inflected variants; the set keeps keywords unique.
    print("\n", {stem(t) for t in tokens}, "\n")
def create_lsa_summary():
    """Write a 4-sentence LSA summary for every topic_<name>.xml tree file.

    For each matching XML file in TREES_SOURCE_PATH, builds the plain-text
    input via create_input() and writes the summary to
    SUM_PATH + <name> + '_LSA_Group5.txt'.

    :returns: None; output is written to the summary files.
    """
    for filename in os.listdir(TREES_SOURCE_PATH):
        if not filename.endswith('.xml'):
            continue
        # Raw string so `\.` is a regex escape, not a deprecated str escape.
        name = re.search(r'topic_(.*)\.xml', filename)
        if name is None:
            # .xml file not following the topic_<name>.xml naming scheme;
            # previously this crashed with AttributeError on .group(1).
            continue
        input_path = create_input(name.group(1))
        with open(SUM_PATH + name.group(1) + '_LSA_Group5.txt', 'w',
                  encoding='utf8') as summary:
            summary.write("====================== General Summary of "
                          + name.group(1) + " ======================\n")
            parser = PlaintextParser.from_file(input_path, Tokenizer("english"))
            summarizer = LsaSummarizer()
            for sentence in summarizer(parser.document, 4):
                summary.write(str(sentence) + '\n')
def create_textrank_summary(max_chars):
    """Write a length-capped TextRank summary for every topic_<name>.xml file.

    For each matching XML file in TREES_SOURCE_PATH, writes up to 3 TextRank
    sentences (never exceeding max_chars characters in total) to
    SUM_PATH + <name> + '_TextRank_Group5.txt', then calls expand_textrank
    to fill the remaining character budget.

    :param max_chars: maximum number of summary characters per topic.
    :returns: None; output is written to the summary files.
    """
    for filename in os.listdir(TREES_SOURCE_PATH):
        chars = 0  # characters written so far for this topic
        if not filename.endswith('.xml'):
            continue
        # Raw string so `\.` is a regex escape, not a deprecated str escape.
        name = re.search(r'topic_(.*)\.xml', filename)
        if name is None:
            # .xml file not following the topic_<name>.xml naming scheme;
            # previously this crashed with AttributeError on .group(1).
            continue
        input_path = create_input(name.group(1))
        path = SUM_PATH + name.group(1)
        with open(path + '_TextRank_Group5.txt', 'w',
                  encoding='utf8') as summary:
            summary.write("====================== General Summary of "
                          + name.group(1) + " ======================\n")
            parser = PlaintextParser.from_file(input_path, Tokenizer("english"))
            summarizer = TextRankSummarizer()
            for sentence in summarizer(parser.document, 3):
                text = str(sentence)  # hoisted: was computed three times
                if chars + len(text) <= max_chars:
                    summary.write(text + '\n')
                    chars += len(text)
        # Run after the `with` so the summary file is flushed and closed
        # before expand_textrank (which receives the same path) touches it.
        expand_textrank(name.group(1), path, max_chars, chars)