Example 1
# The imports below are reconstructed from the aliases used in this snippet
# (an assumption); it also appears to be a method lifted from a class, so a
# (self, text) signature is assumed.
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.lsa import LsaSummarizer as Lsa
from sumy.summarizers.luhn import LuhnSummarizer as Luhn
from sumy.summarizers.lex_rank import LexRankSummarizer as LexRank
from sumy.summarizers.text_rank import TextRankSummarizer as TxtRank
from sumy.summarizers.sum_basic import SumBasicSummarizer as SumBasic
from sumy.summarizers.kl import KLSummarizer as KL


def __init__(self, text):
    LANGUAGE = "english"
    SENTENCES_COUNT = 1


    stemmer = Stemmer(LANGUAGE)

    lsaSummarizer = Lsa(stemmer)
    lsaSummarizer.stop_words = get_stop_words(LANGUAGE)
    luhnSummarizer = Luhn(stemmer)
    luhnSummarizer.stop_words = get_stop_words(LANGUAGE)
    # edmundsonSummarizer.bonus_words = get_bonus_words

    lexrankSummarizer = LexRank(stemmer)
    lexrankSummarizer.stop_words = get_stop_words(LANGUAGE)

    textrankSummarizer = TxtRank(stemmer)
    textrankSummarizer.stop_words = get_stop_words(LANGUAGE)

    sumbasicSummarizer = SumBasic(stemmer)
    sumbasicSummarizer.stop_words = get_stop_words(LANGUAGE)


    klSummarizer = KL(stemmer)
    klSummarizer.stop_words = get_stop_words(LANGUAGE)

    parser = HtmlParser.from_string(text, 0, Tokenizer(LANGUAGE))

    allvariations = []

    for sentence in lsaSummarizer(parser.document, SENTENCES_COUNT):
        # print("Summarizing text via LSA: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in luhnSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Luhn: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in lexrankSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Lexrank: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in textrankSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Textrank: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in sumbasicSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Sumbasic: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in klSummarizer(parser.document, SENTENCES_COUNT):
        # print("Summarizing text via klSum: ")
        print(str(sentence))
        allvariations.append(sentence)

    # return the sentences collected from all six algorithms
    return allvariations
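The six copy-pasted loops differ only in which summarizer they use, so the whole body collapses into a data-driven loop. A minimal sketch, assuming the same sumy aliases imported above; summarize_all is a hypothetical standalone name, not part of the original snippet:

def summarize_all(text, language="english", sentences_count=1):
    # Run every algorithm once and collect each one's selected sentences.
    stemmer = Stemmer(language)
    # None stands in for the source URL, which from_string accepts.
    parser = HtmlParser.from_string(text, None, Tokenizer(language))
    allvariations = []
    for summarizer_cls in (Lsa, Luhn, LexRank, TxtRank, SumBasic, KL):
        summarizer = summarizer_cls(stemmer)
        summarizer.stop_words = get_stop_words(language)
        for sentence in summarizer(parser.document, sentences_count):
            print(str(sentence))
            allvariations.append(sentence)
    return allvariations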
Example 2
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Lsa


# Shown here as a module-level function; in the source it appears to be a
# method, and the imports above are reconstructed assumptions.
def summarize(content, language):
    # Parse the plain text and summarize with LSA; "30%" asks sumy for a
    # summary roughly 30% of the document's length.
    parser = PlaintextParser.from_string(content, Tokenizer(language))
    lsa = Lsa()

    summary = lsa(parser.document, "30%")

    # Join the selected sentences with spaces so they don't run together.
    return " ".join(str(sentence) for sentence in summary)
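A quick usage sketch for summarize; the sample text is made up for illustration:

if __name__ == "__main__":
    # sumy's Tokenizer relies on nltk's punkt sentence-tokenizer data.
    sample = (
        "Automatic summarization condenses a document into a shorter text. "
        "Extractive methods pick the most informative sentences verbatim. "
        "Sumy exposes several such methods behind a single interface. "
        "This sample exists only to exercise the function above."
    )
    # With a "30%" ratio, roughly one of these four sentences comes back.
    print(summarize(sample, "english"))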
Example 3
            #   Check : print('Created PDF file: ' + outputPDFDir + pdfFileName)

            #   Calling convert function and writing each chapter to the .txt file
            txtOutputFile = open(outputTXTDir + txtFileName, 'w')
            txtOutputFile.write(convert(outputPDFDir + pdfFileName))
            txtOutputFile.close()
            #   Check : print('Created TXT file: ' + outputTXTDir + txtFileName)

            #   For plain text files, create a summary
            parser = PlaintextParser.from_file(outputTXTDir + txtFileName,
                                               Tokenizer(LANGUAGE))
            stemmer = Stemmer(LANGUAGE)
            ##  Select one of the available algorithms to create the summary
            if chooseAlgo == 'Lsa':
                summarizer = Lsa(stemmer)
            elif chooseAlgo == 'LexRank':
                summarizer = LexRank(stemmer)
            elif chooseAlgo == 'TextRank':
                summarizer = TextRank(stemmer)
            elif chooseAlgo == 'Luhn':
                summarizer = Luhn(stemmer)
            elif chooseAlgo == 'SumBasic':
                summarizer = SumBasic(stemmer)
            elif chooseAlgo == 'KLsum':
                summarizer = KLsum(stemmer)
            else:
                print('Wrong algorithm selected.')
                sys.exit(1)

            summarizer.stop_words = get_stop_words(LANGUAGE)
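The if/elif ladder above can also be written as a dictionary dispatch. A minimal sketch, assuming the same summarizer aliases (Lsa, LexRank, TextRank, Luhn, SumBasic, KLsum) are in scope:

SUMMARIZERS = {
    'Lsa': Lsa,
    'LexRank': LexRank,
    'TextRank': TextRank,
    'Luhn': Luhn,
    'SumBasic': SumBasic,
    'KLsum': KLsum,
}

try:
    # Look the class up by name, then instantiate it with the stemmer.
    summarizer = SUMMARIZERS[chooseAlgo](stemmer)
except KeyError:
    print('Wrong algorithm selected.')
    sys.exit(1)
summarizer.stop_words = get_stop_words(LANGUAGE)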
Example 4
                    if len(sentences) > 35:
                        # keep only reasonably long strings; very short ones
                        # are likely sentences of just a few words
                        ref_sentences.append(sentences)
                except TypeError:
                    # catch TypeErrors caused by annotated text, e.g. h1 or b tags
                    print("typeError")
                    continue
    trim_ref_sentences.extend(
        Sentence(s, Tokenizer(LANGUAGE)) for s in ref_sentences)

    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    # define summarizers for the summarizing methods being used
    summarizer_Lsa = Lsa(stemmer)
    summarizer_Lsa.stop_words = get_stop_words(LANGUAGE)
    summary_Lsa = summarizer_Lsa(parser.document, SENTENCES_COUNT)

    summarizer_LexRank = LexRank()
    summary_LexRank = summarizer_LexRank(parser.document, SENTENCES_COUNT)

    summarizer_Edmundson = Edmundson(stemmer)
    summarizer_Edmundson.null_words = get_stop_words(LANGUAGE)
    summarizer_Edmundson.bonus_words = parser.significant_words
    summarizer_Edmundson.stigma_words = parser.stigma_words
    summary_Edmundson = summarizer_Edmundson(parser.document, SENTENCES_COUNT)

    # store summaries in a text file
    #list_of_sums = [summary_Lsa, summary_LexRank, summary_Edmundson]
    #f = open('summarized.txt', 'w')
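A minimal sketch finishing the commented-out idea above: write the three summaries to summarized.txt (the filename comes from the commented code; the layout is an assumption):

list_of_sums = [summary_Lsa, summary_LexRank, summary_Edmundson]
with open('summarized.txt', 'w') as f:
    for summary in list_of_sums:
        for sentence in summary:
            f.write(str(sentence) + '\n')
        f.write('\n')  # blank line between the three summaries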
Example 5
def summary(article_url):
    url = article_url
    #url = "http://www.encyclopedia.com/plants-and-animals/plants/plants/potato"
    # url = "http://www.encyclopedia.com/plants-and-animals/plants/plants/cabbage"
    # url = "http://www.encyclopedia.com/medicine/diseases-and-conditions/pathology/accident"
    # url = "http://www.encyclopedia.com/earth-and-environment/atmosphere-and-weather/atmospheric-and-space-sciences-atmosphere/air"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    # create a list of reference sentences to calculate ROUGE_N scores
    ref_sentences = []
    trim_ref_sentences = []
    for paragraph in parser._article.main_text:
        for sections in paragraph:
            for sentences in sections:
                try:
                    if len(sentences) > 35:
                        # keep only reasonably long strings; very short ones
                        # are likely sentences of just a few words
                        ref_sentences.append(sentences)
                except TypeError:
                    # catch TypeErrors caused by annotated text, e.g. h1 or b tags
                    print("typeError")
                    continue
    trim_ref_sentences.extend(
        Sentence(s, Tokenizer(LANGUAGE)) for s in ref_sentences)

    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    # define summarizers for the summarizing methods being used
    summarizer_Lsa = Lsa(stemmer)
    summarizer_Lsa.stop_words = get_stop_words(LANGUAGE)
    summary_Lsa = summarizer_Lsa(parser.document, SENTENCES_COUNT)

    summarizer_LexRank = LexRank()
    summary_LexRank = summarizer_LexRank(parser.document, SENTENCES_COUNT)

    summarizer_Edmundson = Edmundson(stemmer)
    summarizer_Edmundson.null_words = get_stop_words(LANGUAGE)
    summarizer_Edmundson.bonus_words = parser.significant_words
    summarizer_Edmundson.stigma_words = parser.stigma_words
    summary_Edmundson = summarizer_Edmundson(parser.document, SENTENCES_COUNT)

    # print summaries
    summary_Lsa_trim = []
    for sentence in summary_Lsa:
        # trim off super-short sentences - likely only a few words long
        if len(sentence._text) > 20:
            print(sentence)
            summary_Lsa_trim.append(sentence)

    # calc rouge_n scores
    calc_value(summary_Lsa_trim, trim_ref_sentences)

    print('\n')
    summary_LexRank_trim = []
    for sentence in summary_LexRank:
        # trim off super-short sentences - likely only a few words long
        if len(sentence._text) > 20:
            print(sentence)
            summary_LexRank_trim.append(sentence)

    # calc rouge_n scores
    calc_value(summary_LexRank_trim, trim_ref_sentences)

    print('\n')
    summary_Edmundson_trim = []
    for sentence in summary_Edmundson:
        # trim off super-short sentences - likely only a few words long
        if len(sentence._text) > 20:
            print(sentence)
            summary_Edmundson_trim.append(sentence)

    # calc rouge_n scores
    calc_value(summary_Edmundson_trim, trim_ref_sentences)

    # max_r_value returns the index of the best model: 0=Edmundson, 1=Lsa, 2=LexRank
    models = {0: "Edmundson Model", 1: "Lsa Model", 2: "LexRank Model"}
    best_summary = max_r_value(summary_Lsa_trim, summary_LexRank_trim,
                               summary_Edmundson_trim, trim_ref_sentences)
    print(
        models.get(best_summary) +
        ' is the best model according to an average of the Rouge_3, 2 and 1 tests'
    )

    # return the summary of the best model
    if best_summary == 0:
        return summary_Edmundson_trim
    elif best_summary == 1:
        return summary_Lsa_trim
    elif best_summary == 2:
        return summary_LexRank_trim
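calc_value and max_r_value are helpers not shown in this example. A minimal sketch of the kind of score they might compute: a set-based, recall-style ROUGE-N approximation (the function name and exact scoring are assumptions, not the original helpers):

def rouge_n(summary_sentences, reference_sentences, n=2):
    # Build the set of word n-grams appearing in a list of sentences.
    def ngrams(sentences):
        words = [w.lower() for s in sentences for w in str(s).split()]
        return set(zip(*(words[i:] for i in range(n))))

    reference_grams = ngrams(reference_sentences)
    if not reference_grams:
        return 0.0
    # Fraction of reference n-grams that the summary recovers.
    return len(ngrams(summary_sentences) & reference_grams) / len(reference_grams)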
Example 6
def main(argv):
    # Read parameters: input file and output directory
    try:
        # "h" is included so the -h help branch below is reachable
        opts, args = getopt.getopt(argv, "hi:o:",
                                   ["inputFile=", "outputDirectory="])
    except getopt.GetoptError:
        print('main.py -i <inputFile> -o <outputDirectory>')
        sys.exit(2)

    if len(opts) != 2:
        print('main.py -i <inputFile> -o <outputDirectory>')
        sys.exit(2)

    PDF_SummaryDir = ''
    sourcePDFFile = ''

    for opt, arg in opts:
        if opt == '-h':
            print('main.py -i <inputFile> -o <outputDirectory>')
            sys.exit()
        elif opt in ("-i", "--inputFile"):
            sourcePDFFile = arg
            if os.path.exists(sourcePDFFile):
                print('[+] PDF file found')
        elif opt in ("-o", "--outputDirectory"):
            PDF_SummaryDir = arg
            # Check whether the PDF_Summary directory exists
            if not os.path.exists(PDF_SummaryDir):
                os.makedirs(PDF_SummaryDir)
                print('[+] Directory created')

    # Set parameters
    languages = ['spanish', 'english']
    print('Select a language')
    LANGUAGE = languages[cutie.select(languages)]
    print('[+] Language selected')
    SENTENCES_COUNT = 30

    algoritmos = ['Luhn', 'Lsa', 'LexRank', 'TextRank', 'SumBasic', 'KLsum']
    print('Select an algorithm')
    chooseAlgo = algoritmos[cutie.select(algoritmos)]

    #create directories for output files
    outputPDFDir = os.path.dirname(PDF_SummaryDir + '/pdf/pdf_split_files/')
    if not os.path.exists(outputPDFDir):
        os.makedirs(PDF_SummaryDir + '/pdf/pdf_split_files/')

    outputTXTDir = os.path.dirname(PDF_SummaryDir + '/Text_Files/')
    if not os.path.exists(outputTXTDir):
        os.makedirs(PDF_SummaryDir + '/Text_Files/')

    outputSummaryDir = os.path.dirname(PDF_SummaryDir + '/Summary/')
    if not os.path.exists(outputSummaryDir):
        os.makedirs(PDF_SummaryDir + '/Summary/')

    #Name prefix for split files
    outputNamePrefix = 'Split_Chapter_'
    timeSuffixSummary = str(time.strftime("%d-%m-%Y_%H.%M.%S"))
    targetPDFFile = 'temppdfsplitfile.pdf'  # Temporary file

    # Append a trailing slash to the PDF output dir if necessary
    if not outputPDFDir.endswith('/'):
        outputPDFDir = outputPDFDir + '/'

    # Append a trailing slash to the TXT output dir if necessary
    if not outputTXTDir.endswith('/'):
        outputTXTDir = outputTXTDir + '/'

    # Append a trailing slash to the summary output dir if necessary
    if not outputSummaryDir.endswith('/'):
        outputSummaryDir = outputSummaryDir + '/'

    # Wait until the source PDF exists before splitting
    while not os.path.exists(sourcePDFFile):
        print('Source PDF not found, sleeping...')
        time.sleep(10)

    if os.path.exists(sourcePDFFile):
        #print('Found source PDF file')
        #Copy file to local working directory
        shutil.copy(sourcePDFFile, targetPDFFile)

        #Process file
        #Create object and Open File in Read Binary Mode
        pdfFileObj2 = open(targetPDFFile, 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj2)
        pdfFileObj = BookmarkToPageMap(pdfFileObj2)

        #Get total pages
        numberOfPages = pdfReader.numPages

        i = 0
        newPageNum = 0
        prevPageNum = 0
        newPageName = ''
        prevPageName = ''

        for p, t in sorted([
            (v, k) for k, v in pdfFileObj.getDestinationPageNumbers().items()
        ]):
            template = '%-5s  %s'
            #   To Check Page number and Title of the Chapter Uncomment the following lines
            ##  print (template % ('Page', 'Title'))
            ##  print (template % (p+1,t))

            newPageNum = p + 1
            newPageName = t

            if prevPageNum == 0 and prevPageName == '':
                #  First Page
                prevPageNum = newPageNum
                prevPageName = newPageName
            else:
                # Next Page
                pdfWriter = PyPDF2.PdfFileWriter()
                page_idx = 0
                for i in range(prevPageNum, newPageNum):
                    pdfPage = pdfReader.getPage(i - 1)
                    pdfWriter.insertPage(pdfPage, page_idx)
                    #   Check : print('Added page to PDF file: ' + prevPageName + ' - Page #: ' + str(i))
                    page_idx += 1

                #   Creating names of split files
                pdfFileName = str(outputNamePrefix + prevPageName +
                                  '.pdf').replace(':', '_').replace('*', '_')
                txtFileName = str(outputNamePrefix + prevPageName +
                                  '.txt').replace(':', '_').replace('*', '_')

                #   Writing each chapter to the .pdf file
                pdfOutputFile = open(outputPDFDir + pdfFileName, 'wb')
                pdfWriter.write(pdfOutputFile)
                pdfOutputFile.close()

                #   Check : print('Created PDF file: ' + outputPDFDir + pdfFileName)

                #   Calling convert function and writing each chapter to the .txt file
                txtOutputFile = open(outputTXTDir + txtFileName, 'w')
                txtOutputFile.write(convert(outputPDFDir + pdfFileName))
                txtOutputFile.close()
                #   Check : print('Created TXT file: ' + outputTXTDir + txtFileName)

                #   For plain text files, create a summary
                parser = PlaintextParser.from_file(outputTXTDir + txtFileName,
                                                   Tokenizer(LANGUAGE))
                stemmer = Stemmer(LANGUAGE)
                ##  Select one of the available algorithms to create the summary
                if chooseAlgo == 'Lsa':
                    summarizer = Lsa(stemmer)
                elif chooseAlgo == 'LexRank':
                    summarizer = LexRank(stemmer)
                elif chooseAlgo == 'TextRank':
                    summarizer = TextRank(stemmer)
                elif chooseAlgo == 'Luhn':
                    summarizer = Luhn(stemmer)
                elif chooseAlgo == 'SumBasic':
                    summarizer = SumBasic(stemmer)
                elif chooseAlgo == 'KLsum':
                    summarizer = KLsum(stemmer)
                else:
                    print('Wrong algorithm selected.')
                    sys.exit(1)

                summarizer.stop_words = get_stop_words(LANGUAGE)
                #   Open file in append mode so that summary will be added at the bottom of file
                summaryOutputFile = open(
                    outputSummaryDir + chooseAlgo + '_Summary_File' +
                    timeSuffixSummary + '.txt', 'a')
                for sentence in summarizer(parser.document, SENTENCES_COUNT):
                    #   Check : print (sentence)
                    summaryOutputFile.write(str(sentence))

                #   To create separation between chapters
                summaryOutputFile.write(
                    str('\n\n' + 'Title : ' + t + '\n' + '\t'))
                summaryOutputFile.close()

            i = prevPageNum
            prevPageNum = newPageNum
            prevPageName = newPageName

        # Split the last page
        pdfWriter = PyPDF2.PdfFileWriter()
        page_idx = 0
        for i in range(prevPageNum, numberOfPages + 1):
            pdfPage = pdfReader.getPage(i - 1)
            pdfWriter.insertPage(pdfPage, page_idx)
            #   Check : print('Added page to PDF file: ' + prevPageName + ' - Page #: ' + str(i))
            page_idx += 1

        pdfFileObj2.close()
        print('[+] File created: ' + outputSummaryDir + chooseAlgo +
              '_Summary_File' + timeSuffixSummary + '.txt')

    # Delete temp file
    os.unlink(targetPDFFile)
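convert and BookmarkToPageMap are helpers defined elsewhere in this project. For reference, a minimal sketch of what convert might look like on top of pdfminer.six's high-level API (the body is an assumption, not the project's implementation):

from pdfminer.high_level import extract_text

def convert(pdf_path):
    # Extract the full text of a PDF file as one string.
    return extract_text(pdf_path)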