Example #1
def get_uni_context(pdfs, n):

    content = extract_pdf_content(pdfs[n])
    # Find all the words in content, allowing for apostrophes ('s) and hyphens
    content = re.sub(r'\-\n+', '', content)
    content = re.sub(r'\n+', ' ', content)
    #print(content)
    words_list = re.findall(r"[a-zA-Z'\-]+", content)
    #print(type(words_list))
    #print(len(words_list))

    # Get the indices of all university/institute mentions
    uni_index = [
        idx for idx, value in enumerate(words_list)
        if word_equal(value, 'University') or word_equal(value, 'Institute')
    ]
    # Collect the context (15 words on each side) around each match
    #print(uni_index)  # positions of the 'University' keyword in the text
    uni_context = {}
    for i in range(len(uni_index)):
        if uni_index[i] - 15 > 0:
            uni_context[i] = [
                item.lower()
                for item in words_list[uni_index[i] - 15:uni_index[i] + 15]
            ]
        else:
            uni_context[i] = [
                item.lower() for item in words_list[0:uni_index[i] + 15]
            ]

    return uni_context
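# A minimal usage sketch, not part of the original snippet: it assumes
# `import glob`, `import re`, a word_equal() helper and a "pdf/" folder as
# in the surrounding examples, and prints the word window collected around
# each "University"/"Institute" hit.
pdfs = glob.glob("pdf/*.pdf")
contexts = get_uni_context(pdfs, 0)
for i, window in contexts.items():
    print(i, ' '.join(window))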
Example #2
def read_problems():
    content = extract_pdf_content('pdf/tc.pdf', 6)
    content = re.sub(r'\n+\d+\n+', '', content)
    text = content.replace('GRE填空机经1200题',
                           '').replace('\n' * 3, '\n').replace('\n' * 2, '\n')
    print(text)
    sections, _ = rule.find_all(r'section\s*\d+\s*((easy)|(medium)|(hard)|\s*)',
                                text)
    return sections
Example #3
def get_mydict_from_pdf_path(mydict_, pdf_path_):
    """
    Consolidate PDF content extraction into a dictionary.
    Input: an existing dictionary and a PDF folder path; output: the updated dictionary.
    """
    pdfs = glob.glob("{}/*.pdf".format(pdf_path_))
    for pdf in pdfs:
        key = pdf.split('/')[-1]
        if key not in mydict_:
            print("Extracting content from {} ...".format(pdf))
            mydict_[key] = extract_pdf_content(pdf)
    return mydict_
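# A minimal usage sketch, not part of the original snippet: the "pdf" folder
# name is illustrative, and `glob` plus `extract_pdf_content` are assumed to
# be imported as in the surrounding examples.
mydict = get_mydict_from_pdf_path({}, "pdf")
print(mydict.keys())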
Example #4
def read_answers():
    content = extract_pdf_content('pdf/answers.pdf', 0)
    text = content.replace('GRE填空机经1200题', '').replace('\n' * 3, '\n').replace(
        '\n' * 2, '\n')
    # str.replace() is literal, not regex, so strip number-only lines with re.sub
    text = re.sub(r'\n\d*\n', '', text)
    sections, _ = rule.find_all(r'\nSection\s*\d+', text)
    answers = []
    for section in sections:
        section = section.replace('Section', '')
        section_answers = rule.find_all_words('[A-Z]{1,3}', section)

        first_five = section_answers[5:]
        second_five = section_answers[:5]

        answers = [*answers, *first_five, *second_five]
    return answers
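# Illustration only (made-up answers, not from the answer key): with ten
# answers per section, the splice above appends answers 6-10 before answers 1-5.
demo = ['A', 'B', 'C', 'D', 'E', 'AC', 'BD', 'CE', 'AB', 'DE']
print([*demo[5:], *demo[:5]])  # ['AC', 'BD', 'CE', 'AB', 'DE', 'A', 'B', 'C', 'D', 'E']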
Example #5
def save_all_sections():
    content = extract_pdf_content('pdf/tc.pdf', 6, False)
    text = content.replace('GRE填空机经1200题', '').replace('\n' * 3, '\n').replace(
        '\n' * 2, '\n')
    # str.replace() is literal, not regex, so strip number-only lines with re.sub
    text = re.sub(r'\n\d*\n', '', text)
    # text = unicodedata.normalize("NFKC", text)
    m = re.finditer(r'section\s*\d+\s*((easy)|(medium)|(hard)|\s*)', text)
    sections = []
    for s in m:
        item = dict()
        start = s.start()
        end = s.end()
        stext = text[start:end]
        sstext = stext.replace('section', '').strip()
        section_text = sstext.split(' ')
        if len(section_text) > 0:
            item['id'] = int(section_text[0])
            item['level'] = 'unknown'
            if len(section_text) == 2:
                item['level'] = section_text[1]
        sections.append(item)
    up.save_sections(sections)
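# Regex sanity check, illustration only (hypothetical header text, not from
# the PDF): each match is parsed by the loop above into an id plus a level.
demo = 'section 12 medium ... section 3 '
print([s.group().strip() for s in re.finditer(r'section\s*\d+\s*((easy)|(medium)|(hard)|\s*)', demo)])
# -> ['section 12 medium', 'section 3']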
Example #6
"""
https://zhuanlan.zhihu.com/p/34819237
https://github.com/wshuyi/demo-pdf-content-extract-batch-python-pdfminer
Organize the analysis workflow into functions so they can be called more conveniently later.
"""
import glob
from pdf_extractor import extract_pdf_content
import pandas as pd
import matplotlib.pyplot as plt

# Get the paths of all PDF files
pdf_path = "pdf/"
pdfs = glob.glob("{}/*.pdf".format(pdf_path))
print(pdfs)

# Extract the content from the first PDF in the list
content = extract_pdf_content(pdfs[0])
print(content)

# Build a dictionary: batch-extract and store the content
# Iterate over the `pdfs` list, using the file name (without the directory) as the key
mydict = {}
for pdf in pdfs:
    key = pdf.split('/')[-1]
    if key not in mydict:
        print("Extracting content from {} ...".format(
            pdf))  # 为了让这个过程更为清晰,我们让Python输出正在抽取的 pdf 文件名
        mydict[key] = extract_pdf_content(pdf)
print(mydict.keys())

# Turn the dictionary into a data frame to make analysis easier
# Note that the trailing `reset_index()` also turns the index built from the dictionary keys into an ordinary column
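# A minimal sketch of the step described above (the original snippet is cut
# off here; the column names are an assumption, not from the source):
df = pd.DataFrame.from_dict(mydict, orient='index').reset_index()
df.columns = ["file", "content"]
print(df.head())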
Example #7
# -*- coding: UTF-8 -*-
import glob
import os
from pdf_extractor import extract_pdf_content
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Specify the PDF folder path
pdf_path = "/root/dataset/shi_test1/"
# Get all PDF paths
pdfs = glob.glob("{}/*.pdf".format(pdf_path))

print(pdfs)
# Save the text extracted from the first PDF in the content variable
content = extract_pdf_content(pdfs[0])
# Turn the text into a list of words
words_list = re.findall(r"[a-zA-Z]\w+", content)
# Symbols we do not want to keep
#punctuations = ['(',')',',',';',':','[',']']
# stopwords.words() takes a language name; keep only the words that are not stop words
stop_words = stopwords.words('english')
keywords = [word for word in words_list if word.lower() not in stop_words]
#print(words_list)
print(keywords)
#print(len(words_list))
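# Environment note, not part of the original snippet: stopwords.words('english')
# requires the NLTK stopword corpus, which can be fetched once with:
#   import nltk
#   nltk.download('stopwords')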