def get_uni_context(pdfs, n):
    content = extract_pdf_content(pdfs[n])
    # Find all words in the content, handling 's and hyphenation
    content = re.sub(r'\-\n+', '', content)  # rejoin words hyphenated across line breaks
    content = re.sub(r'\n+', ' ', content)
    words_list = re.findall(r"[a-zA-Z'\-]+", content)
    # Collect the indices of every "University"/"Institute" occurrence
    uni_index = [
        idx for idx, value in enumerate(words_list)
        if word_equal(value, 'University') or word_equal(value, 'Institute')
    ]
    # Collect the context around each occurrence: up to 15 words on either side
    uni_context = {}
    for i in range(len(uni_index)):
        start = max(uni_index[i] - 15, 0)
        uni_context[i] = [
            item.lower() for item in words_list[start:uni_index[i] + 15]
        ]
    return uni_context
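# A minimal usage sketch (the "pdf" folder name is an assumption, not from the
# original code): print the context window found around each keyword hit in
# the first PDF of the folder.
if __name__ == '__main__':
    import glob
    sample_pdfs = glob.glob("pdf/*.pdf")
    if sample_pdfs:
        for i, window in get_uni_context(sample_pdfs, 0).items():
            print(i, ' '.join(window))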
def read_problems():
    content = extract_pdf_content('pdf/tc.pdf', 6)
    content = re.sub(r'\n+\d+\n+', '', content)  # drop standalone page numbers
    text = content.replace('GRE填空机经1200题', '') \
                  .replace('\n' * 3, '\n') \
                  .replace('\n' * 2, '\n')
    print(text)
    sections, _ = rule.find_all(r'section\s*\d+\s*((easy)|(medium)|(hard)|\s*)', text)
    return sections
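# Hedged usage sketch: the exact shape of rule.find_all's return value is an
# assumption inferred from the unpacking above (a list of matched section
# texts plus extra match information).
if __name__ == '__main__':
    problem_sections = read_problems()
    print("found {} sections".format(len(problem_sections)))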
def get_mydict_from_pdf_path(mydict_, pdf_path_):
    """
    Consolidate PDF content extraction into a dictionary.

    Takes an existing dictionary and a PDF folder path, and returns the
    dictionary updated with the content of any PDFs not yet in it.
    """
    pdfs = glob.glob("{}/*.pdf".format(pdf_path_))
    for pdf in pdfs:
        key = pdf.split('/')[-1]  # file name without the directory
        if key not in mydict_:
            print("Extracting content from {} ...".format(pdf))
            mydict_[key] = extract_pdf_content(pdf)
    return mydict_
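# Usage sketch (the "pdf" folder name is illustrative): because keys already
# present are skipped, re-running this only extracts newly added PDFs.
if __name__ == '__main__':
    cache = get_mydict_from_pdf_path({}, "pdf")
    print(cache.keys())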
def read_answers():
    content = extract_pdf_content('pdf/answers.pdf', 0)
    text = content.replace('GRE填空机经1200题', '') \
                  .replace('\n' * 3, '\n') \
                  .replace('\n' * 2, '\n')
    text = re.sub(r'\n\d*\n', '', text)  # str.replace does not understand regexes; use re.sub
    sections, _ = rule.find_all(r'\nSection\s*\d+', text)
    answers = []
    for section in sections:
        section = section.replace('Section', '')
        section_answers = rule.find_all_words('[A-Z]{1,3}', section)
        # The extraction apparently yields each section's answers with the two
        # halves swapped, so put the last five before the first five
        last_five = section_answers[5:]
        first_five = section_answers[:5]
        answers = [*answers, *last_five, *first_five]
    return answers
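# The swap above is easiest to see on dummy data:
if __name__ == '__main__':
    demo = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
    print([*demo[5:], *demo[:5]])  # ['F', 'G', 'H', 'I', 'J', 'A', 'B', 'C', 'D', 'E']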
def save_all_sections():
    content = extract_pdf_content('pdf/tc.pdf', 6, False)
    text = content.replace('GRE填空机经1200题', '') \
                  .replace('\n' * 3, '\n') \
                  .replace('\n' * 2, '\n')
    text = re.sub(r'\n\d*\n', '', text)  # str.replace does not understand regexes; use re.sub
    # text = unicodedata.normalize("NFKC", text)
    m = re.finditer(r'section\s*\d+\s*((easy)|(medium)|(hard)|\s*)', text)
    sections = []
    for s in m:
        item = dict()
        stext = text[s.start():s.end()]
        sstext = stext.replace('section', '').strip()
        section_text = sstext.split(' ')
        if len(section_text) > 0:
            item['id'] = int(section_text[0])
            item['level'] = 'unknown'
        if len(section_text) == 2:
            item['level'] = section_text[1]
        sections.append(item)
    up.save_sections(sections)
"""
https://zhuanlan.zhihu.com/p/34819237
https://github.com/wshuyi/demo-pdf-content-extract-batch-python-pdfminer

The analysis workflow is organized into functions so it can be called more
conveniently later.
"""
import glob

import matplotlib.pyplot as plt
import pandas as pd

from pdf_extractor import extract_pdf_content

# Get the paths of all PDF files
pdf_path = "pdf/"
pdfs = glob.glob("{}/*.pdf".format(pdf_path))
print(pdfs)

# Extract the content of the first PDF in the list
content = extract_pdf_content(pdfs[0])
print(content)

# Build a dictionary and batch-extract and store the content:
# iterate over the `pdfs` list, using the file name (without the directory) as the key
mydict = {}
for pdf in pdfs:
    key = pdf.split('/')[-1]
    if key not in mydict:
        # Print the name of the PDF being extracted so the progress is visible
        print("Extracting content from {} ...".format(pdf))
        mydict[key] = extract_pdf_content(pdf)
print(mydict.keys())

# Turn the dictionary into a DataFrame for analysis.
# Note that the trailing `reset_index()` converts the index built from the
# dictionary keys into an ordinary column.
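# A minimal sketch of the step the comment above describes; the 'content'
# column name is an assumption, not from the original code.
df = pd.DataFrame.from_dict(mydict, orient='index', columns=['content']).reset_index()
print(df.head())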
# -*- coding: UTF-8 -*-
import glob
import re

from nltk.corpus import stopwords

from pdf_extractor import extract_pdf_content

# Specify the PDF directory
pdf_path = "/root/dataset/shi_test1/"

# Get the paths of all PDFs
pdfs = glob.glob("{}/*.pdf".format(pdf_path))
print(pdfs)

# Save the text extracted from the first PDF in `content`
content = extract_pdf_content(pdfs[0])

# Turn the text into a list of words
words_list = re.findall(r"[a-zA-Z]\w+", content)

# Filter out common function words. stopwords.words takes a language name,
# not individual words (run nltk.download('stopwords') once beforehand).
stop_words = stopwords.words('english')
keywords = [word for word in words_list if word.lower() not in stop_words]
print(keywords)
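# A possible next step (an assumption, not part of the original script):
# tally keyword frequencies to see which terms dominate the document.
from collections import Counter

word_freq = Counter(word.lower() for word in keywords)
print(word_freq.most_common(10))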