Example #1
import nltk
from nltk.stem import PorterStemmer
from PyPDF2 import PdfFileReader  # legacy PyPDF2 (<2.0) API
from gensim.summarization import keywords  # removed in gensim 4.0; needs gensim < 4
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.utils import get_stop_words

stem = PorterStemmer().stem  # the original's stem() source is unspecified; Porter is one plausible choice


def get_info(path):  # summarize an input PDF and print its keywords
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        number_of_pages = pdf.getNumPages()
        # convert_pdf_to_txt is an external helper not shown in this example (see the sketch below)
        text = convert_pdf_to_txt(path, pages=range(0, number_of_pages))

        LANGUAGE = "english"
        SENTENCES_COUNT = 10
        # For a web page instead of a plain string (needs sumy.parsers.html.HtmlParser):
        #   parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))  # PlaintextParser.from_file for files
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        print("SUMMARY")
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)

        # print(keywords(text))  # raw (unsplit) keyword output
        keyword_list = keywords(text, split=True)    # list of keywords
        keyword_string = " ".join(keyword_list)      # single string of keywords
        tokens = nltk.word_tokenize(keyword_string)  # requires NLTK's 'punkt' data

        print("\n", "KEYWORDS")
        print("\n", {stem(t) for t in tokens}, "\n")  # set of unique stemmed keywords
Example #2
def get_info(path):  # same pipeline as Example #1, but for a plain-text file
    # (uses the same imports as Example #1)
    with open(path, 'r') as f:
        text = f.read()

        LANGUAGE = "english"
        SENTENCES_COUNT = 10
        # For a web page instead of a plain string (needs sumy.parsers.html.HtmlParser):
        #   parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        parser = PlaintextParser.from_string(
            text, Tokenizer(LANGUAGE))  # PlaintextParser.from_file for files
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        print("SUMMARY")
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)

        # print(keywords(text))  # raw (unsplit) keyword output
        keyword_list = keywords(text, split=True)  # list of keywords
        keyword_string = " ".join(keyword_list)    # single string of keywords
        tokens = nltk.word_tokenize(keyword_string)

        print("\n", "KEYWORDS")
        print("\n", {stem(t) for t in tokens}, "\n")  # set of unique stemmed keywords
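Both variants share the same function name, so only one can be defined in a module at a time. A sketch of invoking them (the file names are hypothetical):

get_info("paper.pdf")   # PDF variant (Example #1)
get_info("notes.txt")   # plain-text variant (Example #2)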
Example #3
def create_lsa_summary():
    # TREES_SOURCE_PATH, SUM_PATH and create_input() come from module-level
    # code not shown here (see the sketch after this example)
    for filename in os.listdir(TREES_SOURCE_PATH):
        if filename.endswith('.xml'):
            name = re.search(r'topic_(.*)\.xml', filename)
            input_path = create_input(name.group(1))
            with open(SUM_PATH + name.group(1) + '_LSA_Group5.txt',
                      'w',
                      encoding='utf8') as summary:
                summary.write("====================== General Summary of " +
                              name.group(1) + " ======================\n")
                parser = PlaintextParser.from_file(input_path,
                                                   Tokenizer("english"))
                summarizer = LsaSummarizer()

                # top 4 sentences according to LSA
                sentences = summarizer(parser.document, 4)

                for sentence in sentences:
                    summary.write(str(sentence) + '\n')
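create_lsa_summary relies on module-level names the snippet does not define. One plausible setup follows, with every path assumed; create_input(topic) is assumed to write a plain-text file for the topic and return its path:

# Hypothetical module-level setup; the real values live elsewhere in the project.
import os
import re

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

TREES_SOURCE_PATH = "topics/"   # directory holding topic_*.xml files (assumed)
SUM_PATH = "summaries/"         # output directory for the summaries (assumed)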
Example #4
def create_textrank_summary(max_chars):
    # same module-level setup as Example #3; expand_textrank() is defined elsewhere
    for filename in os.listdir(TREES_SOURCE_PATH):
        chars = 0
        if filename.endswith('.xml'):
            name = re.search(r'topic_(.*)\.xml', filename)
            input_path = create_input(name.group(1))
            path = SUM_PATH + name.group(1)
            with open(path + '_TextRank_Group5.txt', 'w',
                      encoding='utf8') as summary:
                summary.write("====================== General Summary of " +
                              name.group(1) + " ======================\n")
                parser = PlaintextParser.from_file(input_path,
                                                   Tokenizer("english"))
                summarizer = TextRankSummarizer()

                # top 3 sentences according to TextRank
                sentences = summarizer(parser.document, 3)

                for sentence in sentences:
                    # keep sentences only while the character budget allows
                    if chars + len(str(sentence)) <= max_chars:
                        summary.write(str(sentence) + '\n')
                        chars += len(str(sentence))

            # fill the remaining budget with further sentences
            expand_textrank(name.group(1), path, max_chars, chars)
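A possible invocation of both generators; the 600-character budget is illustrative, not from the source:

create_lsa_summary()          # fixed four-sentence LSA summaries
create_textrank_summary(600)  # TextRank summaries capped at a hypothetical 600 characters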