# Imports assumed for the fragments below; the aliases match how the
# summarizer classes are referenced throughout this code.
import getopt
import os
import shutil
import sys
import time

import cutie
import PyPDF2

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.models.dom import Sentence
from sumy.utils import get_stop_words
from sumy.summarizers.lsa import LsaSummarizer as Lsa
from sumy.summarizers.luhn import LuhnSummarizer as Luhn
from sumy.summarizers.lex_rank import LexRankSummarizer as LexRank
from sumy.summarizers.text_rank import TextRankSummarizer as TextRank
from sumy.summarizers.sum_basic import SumBasicSummarizer as SumBasic
from sumy.summarizers.kl import KLSummarizer as KL
from sumy.summarizers.edmundson import EdmundsonSummarizer as Edmundson


def __init__(text):  # used as a plain helper despite the name; takes the text to summarize
    LANGUAGE = "english"
    SENTENCES_COUNT = 1
    stemmer = Stemmer(LANGUAGE)

    # Build one summarizer per algorithm, all sharing the same stemmer
    # and stop word list.
    lsaSummarizer = Lsa(stemmer)
    lsaSummarizer.stop_words = get_stop_words(LANGUAGE)
    luhnSummarizer = Luhn(stemmer)
    luhnSummarizer.stop_words = get_stop_words(LANGUAGE)
    # edmundsonSummarizer.bonus_words = get_bonus_words
    lexrankSummarizer = LexRank(stemmer)
    lexrankSummarizer.stop_words = get_stop_words(LANGUAGE)
    textrankSummarizer = TextRank(stemmer)
    textrankSummarizer.stop_words = get_stop_words(LANGUAGE)
    sumbasicSummarizer = SumBasic(stemmer)
    sumbasicSummarizer.stop_words = get_stop_words(LANGUAGE)
    klSummarizer = KL(stemmer)
    klSummarizer.stop_words = get_stop_words(LANGUAGE)

    parser = HtmlParser.from_string(text, None, Tokenizer(LANGUAGE))

    # Summarize via LSA, Luhn, LexRank, TextRank, SumBasic and KL-Sum in
    # turn, collecting every candidate sentence.
    allvariations = []
    for summarizer in (lsaSummarizer, luhnSummarizer, lexrankSummarizer,
                       textrankSummarizer, sumbasicSummarizer, klSummarizer):
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(str(sentence))
            allvariations.append(sentence)
    return allvariations
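# For context, a brief usage sketch of the helper above (called under its
# original name, __init__); the sample text here is hypothetical.
sample_text = ("Yerba mate is a traditional South American drink. "
               "It is brewed from the dried leaves of the yerba mate plant. "
               "The drink is especially popular in Argentina, Uruguay and Brazil.")
candidates = __init__(sample_text)  # one sentence per algorithm, up to six in total
print(len(candidates))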
def summarize(content, language):
    # Reduce plain text to roughly 30% of its sentences using LSA.
    parser = PlaintextParser.from_string(content, Tokenizer(language))
    lsa = Lsa()
    summary = lsa(parser.document, "30%")  # sumy accepts a percentage as the sentence count
    # Join the selected sentences, separated by spaces so they do not run together.
    return " ".join(str(sentence) for sentence in summary)
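# A quick usage sketch for summarize; the input string is hypothetical.
content = ("The quick brown fox jumps over the lazy dog. "
           "Foxes are omnivorous mammals found on every continent except Antarctica. "
           "Dogs were domesticated from wolves thousands of years ago.")
print(summarize(content, "english"))  # roughly 30% of the sentences, ranked by LSA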
def summary(article_url):
    url = article_url
    # Example URLs used during testing:
    # url = "http://www.encyclopedia.com/plants-and-animals/plants/plants/potato"
    # url = "http://www.encyclopedia.com/plants-and-animals/plants/plants/cabbage"
    # url = "http://www.encyclopedia.com/medicine/diseases-and-conditions/pathology/accident"
    # url = "http://www.encyclopedia.com/earth-and-environment/atmosphere-and-weather/atmospheric-and-space-sciences-atmosphere/air"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    # Create a list of reference sentences to calculate ROUGE-N scores.
    ref_sentences = []
    trim_ref_sentences = []
    for paragraph in parser._article.main_text:
        for sections in paragraph:
            for sentences in sections:
                try:
                    if len(sentences) > 35:  # trim off super short, likely few-word sentences
                        ref_sentences.append(sentences)
                except TypeError:  # catch type errors caused by annotated text, i.e. h1, b, etc.
                    print("typeError")
                    continue
    trim_ref_sentences.extend(
        Sentence(s, Tokenizer(LANGUAGE)) for s in ref_sentences)

    # Or, for plain text files:
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    # Define summarizers for the summarizing methods being used.
    summarizer_Lsa = Lsa(stemmer)
    summarizer_Lsa.stop_words = get_stop_words(LANGUAGE)
    summary_Lsa = summarizer_Lsa(parser.document, SENTENCES_COUNT)

    summarizer_LexRank = LexRank()
    summary_LexRank = summarizer_LexRank(parser.document, SENTENCES_COUNT)

    summarizer_Edmundson = Edmundson(stemmer)
    summarizer_Edmundson.null_words = get_stop_words(LANGUAGE)
    summarizer_Edmundson.bonus_words = parser.significant_words
    summarizer_Edmundson.stigma_words = parser.stigma_words
    summary_Edmundson = summarizer_Edmundson(parser.document, SENTENCES_COUNT)

    # Print each summary, dropping very short sentences, then calculate
    # its ROUGE-N scores against the reference sentences.
    summary_Lsa_trim = []
    for sentence in summary_Lsa:
        if len(sentence._text) > 20:  # trim off super short, likely few-word sentences
            print(sentence)
            summary_Lsa_trim.append(sentence)
    calc_value(summary_Lsa_trim, trim_ref_sentences)
    print('\n')

    summary_LexRank_trim = []
    for sentence in summary_LexRank:
        if len(sentence._text) > 20:
            print(sentence)
            summary_LexRank_trim.append(sentence)
    calc_value(summary_LexRank_trim, trim_ref_sentences)
    print('\n')

    summary_Edmundson_trim = []
    for sentence in summary_Edmundson:
        if len(sentence._text) > 20:
            print(sentence)
            summary_Edmundson_trim.append(sentence)
    calc_value(summary_Edmundson_trim, trim_ref_sentences)

    # max_r_value returns the index of the best model: 0=Edmundson, 1=Lsa, 2=LexRank.
    models = {0: "Edmundson Model", 1: "Lsa Model", 2: "LexRank Model"}
    best_summary = max_r_value(summary_Lsa_trim, summary_LexRank_trim,
                               summary_Edmundson_trim, trim_ref_sentences)
    print(models.get(best_summary) +
          ' is the best model according to an average of the Rouge_3, 2 and 1 tests')

    # Return the summary of the best model.
    if best_summary == 0:
        return summary_Edmundson_trim
    elif best_summary == 1:
        return summary_Lsa_trim
    elif best_summary == 2:
        return summary_LexRank_trim
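# calc_value and max_r_value are ROUGE helpers defined elsewhere in this
# project. As a rough sketch of the kind of n-gram recall they are assumed
# to compute (rouge_n is an illustrative name, not the project's API):
def rouge_n(candidate_sentences, reference_sentences, n=2):
    # Illustrative ROUGE-N recall: fraction of reference n-grams that
    # also appear in the candidate summary.
    def ngrams(sentences):
        words = [w.lower() for s in sentences for w in str(s).split()]
        return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

    reference = ngrams(reference_sentences)
    candidate = set(ngrams(candidate_sentences))
    if not reference:
        return 0.0
    return sum(1 for g in reference if g in candidate) / len(reference)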
def main(argv):
    # Read the input file and output directory parameters.
    try:
        opts, args = getopt.getopt(argv, "i:o:",
                                   ["inputFile=", "outputDirectory="])
    except getopt.GetoptError:
        print('main.py -i <inputFile> -o <outputDirectory>')
        sys.exit(2)
    if len(opts) != 2:
        print('main.py -i <inputFile> -o <outputDirectory>')
        sys.exit(2)

    PDF_SummaryDir = ''
    sourcePDFFile = ''
    for opt, arg in opts:
        if opt == '-h':
            print('main.py -i <inputFile> -o <outputDirectory>')
            sys.exit()
        elif opt in ("-i", "--inputFile"):
            sourcePDFFile = arg
            if os.path.exists(sourcePDFFile):
                print('[+] PDF file found')
        elif opt in ("-o", "--outputDirectory"):
            PDF_SummaryDir = arg
            # Create the PDF_summary directory if it does not exist.
            if not os.path.exists(PDF_SummaryDir):
                os.makedirs(PDF_SummaryDir)
                print('[+] Directory created')

    # Set parameters.
    languages = ['spanish', 'english']
    print('Select a language')
    LANGUAGE = languages[cutie.select(languages)]
    print('[+] Language selected')
    SENTENCES_COUNT = 30

    algoritmos = ['Luhn', 'Lsa', 'LexRank', 'TextRank', 'SumBasic', 'KLsum']
    print('Select an algorithm')
    chooseAlgo = algoritmos[cutie.select(algoritmos)]

    # Create directories for the output files.
    outputPDFDir = os.path.dirname(PDF_SummaryDir + '/pdf/pdf_split_files/')
    if not os.path.exists(outputPDFDir):
        os.makedirs(PDF_SummaryDir + '/pdf/pdf_split_files/')
    outputTXTDir = os.path.dirname(PDF_SummaryDir + '/Text_Files/')
    if not os.path.exists(outputTXTDir):
        os.makedirs(PDF_SummaryDir + '/Text_Files/')
    outputSummaryDir = os.path.dirname(PDF_SummaryDir + '/Summary/')
    if not os.path.exists(outputSummaryDir):
        os.makedirs(PDF_SummaryDir + '/Summary/')

    # Name prefix for split files.
    outputNamePrefix = 'Split_Chapter_'
    timeSuffixSummary = str(time.strftime("%d-%m-%Y_%H.%M.%S"))
    targetPDFFile = 'temppdfsplitfile.pdf'  # temporary file

    # Append a trailing slash to each output directory if necessary.
    if not outputPDFDir.endswith('/'):
        outputPDFDir = outputPDFDir + '/'
    if not outputTXTDir.endswith('/'):
        outputTXTDir = outputTXTDir + '/'
    if not outputSummaryDir.endswith('/'):
        outputSummaryDir = outputSummaryDir + '/'

    # Check and verify that the PDF is ready for splitting.
    while not os.path.exists(sourcePDFFile):
        print('Source PDF not found, sleeping...')
        time.sleep(10)

    if os.path.exists(sourcePDFFile):
        # print('Found source PDF file')
        # Copy the file to the local working directory.
        shutil.copy(sourcePDFFile, targetPDFFile)

        # Process the file: open it in read-binary mode (legacy PyPDF2 API).
        pdfFileObj2 = open(targetPDFFile, 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj2)
        # BookmarkToPageMap (defined elsewhere) maps bookmark titles to page numbers.
        pdfFileObj = BookmarkToPageMap(pdfFileObj2)

        # Get the total number of pages.
        numberOfPages = pdfReader.numPages
        i = 0
        newPageNum = 0
        prevPageNum = 0
        newPageName = ''
        prevPageName = ''
        for p, t in sorted([
                (v, k)
                for k, v in pdfFileObj.getDestinationPageNumbers().items()
        ]):
            template = '%-5s %s'
            # To check the page number and chapter title, uncomment:
            # print(template % ('Page', 'Title'))
            # print(template % (p + 1, t))
            newPageNum = p + 1
            newPageName = t
            if prevPageNum == 0 and prevPageName == '':
                # First page.
                prevPageNum = newPageNum
                prevPageName = newPageName
            else:
                # Next page: copy the chapter's pages into a new writer.
                pdfWriter = PyPDF2.PdfFileWriter()
                page_idx = 0
                for i in range(prevPageNum, newPageNum):
                    pdfPage = pdfReader.getPage(i - 1)
                    pdfWriter.insertPage(pdfPage, page_idx)
                    # Check: print('Added page to PDF file: ' + prevPageName + ' - Page #: ' + str(i))
                    page_idx += 1

                # Build names for the split files, replacing characters that
                # are not valid in file names.
                pdfFileName = str(outputNamePrefix + prevPageName +
                                  '.pdf').replace(':', '_').replace('*', '_')
                txtFileName = str(outputNamePrefix + prevPageName +
                                  '.txt').replace(':', '_').replace('*', '_')

                # Write the chapter to a .pdf file.
                pdfOutputFile = open(outputPDFDir + pdfFileName, 'wb')
                pdfWriter.write(pdfOutputFile)
                pdfOutputFile.close()
                # Check: print('Created PDF file: ' + outputPDFDir + pdfFileName)

                # Call the convert function and write the chapter to a .txt file.
                txtOutputFile = open(outputTXTDir + txtFileName, 'w')
                txtOutputFile.write(convert(outputPDFDir + pdfFileName))
                txtOutputFile.close()
                # Check: print('Created TXT file: ' + outputTXTDir + txtFileName)

                # Create the summary from the plain text file.
                parser = PlaintextParser.from_file(outputTXTDir + txtFileName,
                                                   Tokenizer(LANGUAGE))
                stemmer = Stemmer(LANGUAGE)

                # Select the algorithm used to create the summary.
                if chooseAlgo == 'Lsa':
                    summarizer = Lsa(stemmer)
                elif chooseAlgo == 'LexRank':
                    summarizer = LexRank(stemmer)
                elif chooseAlgo == 'TextRank':
                    summarizer = TextRank(stemmer)
                elif chooseAlgo == 'Luhn':
                    summarizer = Luhn(stemmer)
                elif chooseAlgo == 'SumBasic':
                    summarizer = SumBasic(stemmer)
                elif chooseAlgo == 'KLsum':
                    summarizer = KL(stemmer)
                else:
                    print('Wrong algorithm selected.')
                    sys.exit(0)
                summarizer.stop_words = get_stop_words(LANGUAGE)

                # Open the file in append mode so each chapter's summary is
                # added at the bottom of the file.
                summaryOutputFile = open(
                    outputSummaryDir + chooseAlgo + '_Summary_File' +
                    timeSuffixSummary + '.txt', 'a')
                for sentence in summarizer(parser.document, SENTENCES_COUNT):
                    # Check: print(sentence)
                    summaryOutputFile.write(str(sentence))
                # Create a separation between chapters.
                summaryOutputFile.write(str('\n\n' + 'Title : ' + t + '\n' + '\t'))
                summaryOutputFile.close()

                i = prevPageNum
                prevPageNum = newPageNum
                prevPageName = newPageName

        # Split the last chapter.
        pdfWriter = PyPDF2.PdfFileWriter()
        page_idx = 0
        for i in range(prevPageNum, numberOfPages + 1):
            pdfPage = pdfReader.getPage(i - 1)
            pdfWriter.insertPage(pdfPage, page_idx)
            # Check: print('Added page to PDF file: ' + prevPageName + ' - Page #: ' + str(i))
            page_idx += 1
        # Write the last chapter to a .pdf file.
        pdfFileName = str(outputNamePrefix + prevPageName +
                          '.pdf').replace(':', '_').replace('*', '_')
        with open(outputPDFDir + pdfFileName, 'wb') as pdfOutputFile:
            pdfWriter.write(pdfOutputFile)

        pdfFileObj2.close()
        print('[+] File created: ' + outputSummaryDir + chooseAlgo +
              '_Summary_File' + timeSuffixSummary + '.txt')
        # Delete the temporary file.
        os.unlink(targetPDFFile)
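# Two helpers are assumed to exist elsewhere in the project:
# BookmarkToPageMap, which maps PDF bookmarks to page numbers, and
# convert, which extracts a PDF's text. As a rough sketch, convert could
# be implemented with pdfminer.six; this is an assumption, not
# necessarily the project's actual implementation.
from pdfminer.high_level import extract_text

def convert(pdf_path):
    # Return the extracted plain text of the PDF at pdf_path.
    return extract_text(pdf_path)

# With these in place, the script is invoked per the usage string above,
# e.g.: python main.py -i book.pdf -o ./PDF_Summary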