def parse(pdf_path):
    """Extract horizontal text from a PDF and append the cleaned text to
    '<pdf basename>.txt', merging lines that pdfminer split but that belong
    to the same sentence/paragraph.

    Uses the module-level tolerance ``eps`` when comparing line heights, and
    the module helpers ``is_chinese``, ``match_pattern`` and ``spe_pun_drop``.
    Returns None; output is written as a side effect.
    """
    global eps  # height-comparison tolerance shared across the module
    # Output file is named after the PDF itself (extension swapped for .txt).
    key = pdf_path.split('/')[-1]
    print('extracting from ', key)
    # fix: the file handle used to be leaked; 'with' guarantees it is closed.
    with open(pdf_path, 'rb') as fp:
        # Wire up the (legacy-API) pdfminer parser/document pair.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialize with an empty password; skip encrypted documents.
        try:
            doc.initialize()
        except PDFEncryptionError:
            return
        # Skip documents that do not allow text extraction.
        if not doc.is_extractable:
            return  # raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Counters for pages, images, curves, figures and horizontal text boxes.
        num_page, num_image, num_curve, num_figure, num_TextBoxHorizontal = 0, 0, 0, 0, 0
        for page in doc.get_pages():
            num_page += 1
            # Pending unpunctuated sentence carried between text boxes on this page.
            pre_sent = {'text': '', 'height': 0, 'left': 0, 'width': 0}
            pre_flag = False
            post_flag = False
            try:
                interpreter.process_page(page)
            except (KeyError, AssertionError, OSError):
                # pdfminer can fail on malformed pages; skip them.
                continue
            # Page separator in the output file.
            with open(key[:-4] + '.txt', 'a', encoding='utf-8') as f:
                f.write('\n\n')
            # Receive this page's LTPage object.
            layout = device.get_result()
            # Buckets of strings sharing the same line height / left edge; used
            # to re-join sentences that the PDF layout split across lines.
            text_dic_list = []
            for x in layout:
                if isinstance(x, LTImage):
                    num_image += 1
                if isinstance(x, LTCurve):
                    num_curve += 1
                if isinstance(x, LTFigure):
                    num_figure += 1  # fix: was 'num_figure += 11'
                if isinstance(x, LTTextBoxHorizontal):
                    num_TextBoxHorizontal += 1
                    # Take the first character of every char object in the box.
                    # (Dropped a dead inner loop that computed an unused 'ch'.)
                    results = ""
                    for i in x._objs:
                        for j in i._objs:
                            results += j._text[0]
                    # Prefer the height of a Chinese character on the first
                    # line; fall back to the box's average line height.
                    height = x._avg_lineheight
                    for gethei in range(len(x._objs[0]._objs)):
                        if is_chinese(x._objs[0]._objs[gethei]._text[0]):
                            height = x._objs[0]._objs[gethei].height
                    if match_pattern(results):  # passes the heuristic filter
                        nresults = spe_pun_drop(results)
                        inserted = False
                        # Main cleaning step: join sentences split across PDF
                        # lines by merging same-width/height strings of a page.
                        for item in text_dic_list:
                            if (abs(item['hide'] - height) < eps) and abs(item['left'] - x.x0) < 5 * height:
                                # Prepend any pending sentence with matching geometry.
                                if pre_flag and abs(pre_sent['height'] - height) < eps and pre_sent['width'] >= x.width - height * 5:
                                    nresults = pre_sent['text'] + nresults
                                # Add a newline between vertically separated paragraphs.
                                if (item['y0'] - x.y0) > 4 * height and nresults[0] != '\n':
                                    nresults = '\n' + nresults
                                item['text'] += nresults
                                if x.x0 > item['left']:
                                    item['left'] = x.x0
                                if x.y0 < item["y0"]:
                                    item["y0"] = x.y0
                                inserted = True
                                break
                        if not inserted:
                            # Still prepend a pending sentence when geometry matches.
                            if pre_flag and abs(pre_sent['height'] - height) < eps and pre_sent['width'] >= x.width - height * 5:
                                nresults = pre_sent['text'] + nresults
                            text_dic_list.append({
                                'hide': height,
                                'left': x.x0,
                                'width': x.width,
                                'text': nresults,
                                "y0": x.y0
                            })
                        pre_flag = False
                        post_flag = True
                    else:
                        fun_flag = False
                        # Handle the last line of a passage that carries no
                        # closing punctuation: append it to a matching bucket.
                        if post_flag == True:
                            nresults = spe_pun_drop(results)
                            for item in text_dic_list:
                                if (abs(item['hide'] - height) < eps
                                        and abs(item['left'] - x.x0) < height * 5):
                                    item['text'] += nresults
                                    fun_flag = True
                                    break
                            if fun_flag == False:  # consecutive unpunctuated lines
                                post_flag = False
                        if not fun_flag:
                            # Accumulate consecutive unpunctuated lines into pre_sent.
                            if pre_flag and abs(pre_sent['height'] - height) < eps and abs(pre_sent['left'] - x.x0) < height * 5:
                                pre_sent['text'] += spe_pun_drop(results)
                            else:
                                pre_sent['text'] = spe_pun_drop(results)
                                pre_sent['height'] = height
                                pre_sent['left'] = x.x0
                                pre_sent['width'] = x.width
                            pre_flag = True
            # Flush this page's buckets to the output file.
            for item in text_dic_list:
                with open(key[:-4] + '.txt', 'a', encoding='utf-8') as f:
                    f.write(item['text'] + '\n')
        print('对象数量:\n', '页面数:%s\n' % num_page, '图片数:%s\n' % num_image,
              '曲线数:%s\n' % num_curve,
              '水平文本框:%s\n' % num_TextBoxHorizontal)
def parse_pdf(file_path, method='tika'):
    """
    Given a PDF file complete path, the function parses the file, counts the
    number of pages and checks if it is text-extractable.

    Parameters
    ----------
    file_path: string
        Complete path to output file.
    method: string
        Method used to extract the text: 'pdfminer', 'pypdf', 'tika'.

    Return
    ------
    extracted_text: string
        Text extracted from the document.
    number_of_pages:
        Number of pages of the document.

    Raises
    ------
    ValueError
        If `method` is not one of the supported extractors.
    """
    # fix: the branches used to be independent `if`s with the `else` bound
    # only to the tika check, so 'pdfminer'/'pypdf' logged a spurious
    # "method not found" and an unknown method crashed with NameError.
    if method == 'pdfminer':
        with open(file_path, "rb") as fp:
            # Create parser object to parse the pdf content
            pdf_parser = PDFParser(fp)
            # Store the parsed content in PDFDocument object
            document = PDFDocument(pdf_parser)
            # Check if document is text-extractable or not
            is_extractable = document.is_extractable
            # Check if document is extractable, if not abort
            if not is_extractable:
                raise PDFTextExtractionNotAllowed
            # PDFResourceManager stores shared resources such as fonts or images
            rsrcmgr = PDFResourceManager()
            # Set parameters for analysis
            laparams = LAParams()
            # Page aggregator device to get LT object elements
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            # Interpreter object to process page content from PDFDocument
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            extracted_text = ""
            number_of_pages = 0
            # Process PDF document page by page
            for page in PDFPage.create_pages(document):
                number_of_pages = number_of_pages + 1
                extracted_text += f"[Page {number_of_pages}]\n"
                # The interpreter processes the page stored in PDFDocument object
                interpreter.process_page(page)
                # The device renders the layout from interpreter
                layout = device.get_result()
                # Of the many LT objects we only keep LTTextBox and LTTextLine
                for lt_obj in layout:
                    if isinstance(lt_obj, LTTextBox) or isinstance(
                            lt_obj, LTTextLine):
                        extracted_text += lt_obj.get_text()
    elif method == 'pypdf':
        with open(file_path, 'rb') as f:
            pdf = PdfFileReader(f)
            number_of_pages = pdf.getNumPages()
            extracted_text = ''.join([
                f'[Page {i}]\n' + pdf.getPage(i).extractText()
                for i in range(number_of_pages)
            ])
    elif method == 'tika':
        raw = parser.from_file(file_path)
        extracted_text = raw['content']
        number_of_pages = int(raw['metadata']['xmpTPg:NPages'])
    else:
        logging.error(f'Text extractor method {method} not found')
        # fix: fail loudly instead of hitting a NameError on the return below.
        raise ValueError(f'Text extractor method {method} not found')
    return extracted_text, number_of_pages
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Directory of conference PDFs to process.
filepath = 'C:/Users/lenovo/Desktop/ACL2020'
list1 = os.listdir(filepath)
# NOTE(review): list_words is never appended to in this chunk — presumably
# populated further down in the original script; verify before relying on it.
list_words = []
for i in range(len(list1)):
    outs = ""
    # Open each PDF in binary mode and wire up the pdfminer objects.
    # NOTE(review): fp is never closed in this chunk.
    fp = open(filepath + '/' + list1[i], 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser=parser)
    parser.set_document(doc=doc)
    resource = PDFResourceManager()
    laparam = LAParams()
    device = PDFPageAggregator(resource, laparams=laparam)
    interpreter = PDFPageInterpreter(resource, device)
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        layout = device.get_result()
        for out in layout:
            # Collect text from every layout object that exposes get_text().
            # Each new piece is PREPENDED, so text accumulates in reverse
            # order of encounter.
            if hasattr(out, 'get_text'):
                outs = out.get_text() + outs
    # Normalize: lowercase and strip newlines.
    outs = outs.lower().replace('\n', '')
    # NOTE(review): english_pu is unused in this chunk, and its last two
    # entries are the same character ('“' twice) — possibly '”' was intended.
    english_pu = ['’', '“', '“']
    # Remove all ASCII punctuation from the article in one translate() pass.
    punctuation_map = dict((ord(char), None) for char in string.punctuation)
    without_punctuation = outs.translate(punctuation_map)
    # Tokenize the article text into a list of words.
    raw_words = nltk.word_tokenize(
        without_punctuation)
    # Lemmatize each token with WordNet.
    wordnet_lematizer = WordNetLemmatizer()
    words = [wordnet_lematizer.lemmatize(raw_word) for raw_word in raw_words]
    def get_signatures_from_pdf(self, path, year=''):
        """Parse a gazette-style PDF and attach signature sets to the
        regulations found on its first page.

        path: filesystem path of the PDF to parse.
        year: expected year as a string; a line containing a date of this
              year (or the previous one), or the 'Οι Υπουργοί' header,
              marks the start of a signature block.
        Returns the regulations list (from self.get_document_info) with a
        'signatures' key added per regulation, or None when the PDF has no
        pages or no regulations are found.
        """
        codec = 'utf-8'
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr=rsrcmgr, laparams=laparams)
        # NOTE(review): fp is never closed in this function.
        fp = open(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        pages = PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True)
        # Materialize the page generator so it can be indexed and reversed.
        temp_pages = []
        for page in pages:
            temp_pages.append(page)
        if not temp_pages:
            return
        # The first page carries the document info / regulation list.
        first_page = temp_pages[0]
        interpreter.process_page(first_page)
        first_page_layout = device.get_result()
        regulations = self.get_document_info(first_page_layout)
        # Header lines that must not be treated as signer names.
        ignore_words = ['ΟI ΥΠΟΥΡΓΟI', 'ΤΑ ΜΕΛΗ', 'ΟΙ ΥΠΟΥΡΓΟΙ']
        if not regulations:
            return
        signature_sets = []
        # Start from the last page until all the required signature sets are found
        for page in reversed(temp_pages):
            # Get the page's layout
            interpreter.process_page(page)
            page_layout = device.get_result()
            # Split text to line's for easier parsing
            text_lines = self.text_from_layout_objects(page_layout).split("\n")
            # Boolean indicating whether we are currently in a signature set
            # Save the data found
            search_active = False
            persons = []
            names = []
            roles = []
            role = ""
            # NOTE(review): temp_name is assigned but never read.
            temp_name = ""
            for line in text_lines:
                line = line.strip()
                if search_active:
                    if self.is_break_point(line):
                        # Pair each collected name with its role; missing
                        # roles default to the empty string.
                        for index, name in enumerate(names):
                            current_role = roles[index] if index < len(
                                roles) else ""
                            persons.append({
                                'name': name,
                                'role': Helper.format_role(current_role)
                            })
                        # Continue searching at next point
                        role = ""
                        temp_name = ""
                        search_active = False
                        if persons:
                            signature_sets.append(persons)
                            persons = []
                        # Break if enough signature sets have been found. Otherwise we'll continue looking for
                        # more in the same page.
                        if len(signature_sets) == len(regulations):
                            break
                    normal_line = Helper.normalize_greek_name(line)
                    if normal_line in ignore_words:
                        continue
                    # Lines marked with '***' carry a signer name; any other
                    # line accumulates into the pending role text.
                    if '***' in line and normal_line:
                        if role:
                            roles.append(role)
                            role = ""
                        names.append(normal_line)
                    else:
                        role += line
                elif (year in line and Helper.date_match(year).match(line)) \
                        or (str(int(year) - 1) in line and Helper.date_match(str(int(year) - 1)).match(line)) \
                        or line == 'Οι Υπουργοί':
                    # A dated line (this or previous year) or the ministers
                    # header starts a signature block.
                    search_active = True
            # If the end of page has been reached we save the signatures
            if persons:
                signature_sets.append(persons)
            # When we find enough signature sets we stop parsing pages.
            if len(signature_sets) == len(regulations):
                break
        # Merge regulations and signature sets (sets were collected in
        # reverse page order, hence reversed() here).
        for index, signatures in enumerate(reversed(signature_sets)):
            if index >= len(regulations):
                return
            regulations[index]['signatures'] = signatures
        return regulations
"""open pdf file generate interpreter for each page->interpret it to text n save it create a txt file with saved data""" from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.pdfpage import PDFPage from pdfminer.converter import TextConverter from pdfminer.layout import LAParams import io pdfpath = 'Downloads\\1pe17cs032_finalresume.pdf' pdf = open(pdfpath, 'rb') mem = io.StringIO() rm = PDFResourceManager() lp = LAParams() cnv = TextConverter(rm, mem, laparams=lp) ip = PDFPageInterpreter(rm, cnv) for i in PDFPage.get_pages(pdf): ip.process_page(i) text = mem.getvalue() file = open("Downloads\\1pe17cs032_finalresume.txt", 'wb') file.write(text.encode('utf-8')) print("done")
def get_data(setting) -> list:
    '''
    初期化
    '''
    # (Initialization.)
    # Layout Analysis parameters (vertical-text detection left at default here).
    laparams = LAParams()
    # Resource manager for shared resources (fonts, images).
    resource_manager = PDFResourceManager()
    # PageAggregator device that collects page layout objects.
    device = PDFPageAggregator(resource_manager, laparams=laparams)
    # Interpreter that renders pages into the device.
    interpreter = PDFPageInterpreter(resource_manager, device)
    pdf_archive_dir = setup_pdf_archive_dir()
    # pdf data
    patient_datas_pdf = []
    patient_datas_old = []
    ret_data = []
    '''
    リスト取得
    '''
    # (Fetch the list.)
    befor_tb_avg = 10000  # (top + bottom)/2 of the previously seen box
    # Read and process each split PDF one by one.
    box_list = []
    for pdf_url in setting.pdf_urls:
        # Download the PDF into the archive directory.
        pdf_path = os.path.join(pdf_archive_dir, pdf_url.split('/')[-1])
        print(pdf_url)
        with urllib.request.urlopen(pdf_url) as u:
            with open(pdf_path, 'bw') as o:
                o.write(u.read())
        with open(pdf_path, 'rb') as f:
            for page in PDFPage.get_pages(f):
                interpreter.process_page(page)
                layout = device.get_result()
                # Collect the text boxes contained in the page.
                boxes = find_textboxes_recursively(layout)
                # Sort boxes by their top-left coordinate; y1 grows upward,
                # so it is negated to sort top-to-bottom.
                boxes.sort(key=lambda b: (-b.y1, b.x0))
                for box in boxes:
                    if is_skip(box.get_text()) is True:
                        # '#N/A' marks an invalid row: drop what was gathered.
                        if box.get_text().find('#N/A') != -1:
                            box_list = []
                        continue
                    temp_tb_avg = (box.y1 + box.y0) / 2
                    # A vertical jump of more than 15pt means a new table row:
                    # flush the boxes gathered for the previous row.
                    if 15 < befor_tb_avg - temp_tb_avg or befor_tb_avg - temp_tb_avg < -15:
                        box_list.sort(key=lambda b: (b.x0))
                        if len(box_list) == 0:
                            befor_tb_avg = temp_tb_avg
                            box_list = []
                        elif box_list[0].get_text().find(
                                '-1') != -1 or box_list[0].get_text().find(
                                    '○') != -1 or box_list[0].get_text().find(
                                        '(cid:16089)1') != -1:
                            # Rows whose first cell is a placeholder are dropped.
                            befor_tb_avg = temp_tb_avg
                            box_list = []
                        else:
                            temp_pd = patient_data(box_list)
                            temp_pd.parse_line()
                            if temp_pd.is_error is False:
                                patient_datas_pdf.append(temp_pd)
                                befor_tb_avg = temp_tb_avg
                                box_list = []
                            else:
                                print('error')
                                print(box_list)
                                befor_tb_avg = temp_tb_avg
                                box_list = []
                    befor_tb_avg = temp_tb_avg
                    box_list.append(box)
        # 前のPDFファイルで残されたデータの処理
        # (Process data left over from the previous PDF file.)
        # NOTE(review): placement of this flush relative to the pdf_url loop
        # is inferred from the comment text — confirm against the original.
        box_list.sort(key=lambda b: (b.x0))
        temp_pd = patient_data(box_list)
        temp_pd.parse_line()
        if temp_pd.is_error is False:
            patient_datas_pdf.append(temp_pd)
            befor_tb_avg = temp_tb_avg
            box_list = []
        else:
            print('error')
            print(box_list)
            befor_tb_avg = temp_tb_avg
            box_list = []
    # 最後でデータを処理
    # (Process the remaining data at the very end.)
    if len(box_list) == 0:
        befor_tb_avg = temp_tb_avg
        box_list = []
    else:
        temp_pd = patient_data(box_list)
        temp_pd.parse_line()
        if temp_pd.is_error is False:
            print(temp_pd.no)
            patient_datas_pdf.append(temp_pd)
            befor_tb_avg = temp_tb_avg
            box_list = []
        else:
            print('error')
            print(box_list)
            befor_tb_avg = temp_tb_avg
            box_list = []
    # Rows were collected top-of-page first; reverse to restore order.
    patient_datas_pdf.reverse()
    # 閲覧不可になったデータの処理
    # (Process records that are no longer viewable online: patients
    # No. 1..12655 are loaded from an archived JSON file instead.)
    old_no_range = list(range(1, 12656))
    row_datas = []
    patient_datas_old = []
    with open(
            os.path.dirname(os.path.abspath(__file__)) + "/data/row_data.json",
            "r") as f:
        row_datas = json.load(f)
    for row_data in row_datas:
        if int(row_data['No']) not in old_no_range:
            continue
        temp_patient_data = patient_data()
        temp_patient_data.no = row_data['No']
        temp_patient_data.revealed_dt = dt.strptime(row_data['revealed_dt'],
                                                    '%Y-%m-%d')
        temp_patient_data.old = row_data['old']
        temp_patient_data.sex = row_data['sex']
        temp_patient_data.job = row_data['job']
        temp_patient_data.symptom = row_data['symptom']
        # Empty onset date means unknown.
        if row_data['appearance_dt'] == '':
            temp_patient_data.appearance_dt = None
        else:
            temp_patient_data.appearance_dt = dt.strptime(
                row_data['appearance_dt'], '%Y-%m-%d')
        # Statuses 1-4 are collapsed into status 7 for archived records.
        if row_data['status_id'] in [1, 2, 3, 4]:
            temp_patient_data.status_id = 7
        else:
            temp_patient_data.status_id = row_data['status_id']
        patient_datas_old.append(temp_patient_data)
    # Merge archived and freshly parsed records, sorted by patient number.
    patient_datas = patient_datas_old + patient_datas_pdf
    patient_datas_sorted = sorted(patient_datas, key=lambda x: int(x.no))
    for patient in patient_datas_sorted:
        ret_data.append(patient.export_dict())
    return ret_data
def uploaded_file(filename):
    # Parse a previously uploaded menu PDF ('UPLOAD_FOLDER/pdf_temp.pdf'),
    # detect categories, dishes and prices from text-box geometry, draw
    # bounding-box overlays, and return the final annotated PDF.
    # NOTE(review): the `filename` parameter is never used; all paths are the
    # literal string 'UPLOAD_FOLDER/...', not a configured folder variable.
    # Read file
    pdf = pdfquery.PDFQuery('UPLOAD_FOLDER/pdf_temp.pdf')
    pdf.load()
    # Save xml tree
    pdf.tree.write('UPLOAD_FOLDER/test.xml', pretty_print=True)
    pq_items = pdf.pq('LTTextBoxVertical, LTTextLineHorizontal')
    items = pd.DataFrame(
        columns=['name', 'x0', 'x1', 'y0', 'y1', 'height', 'width', 'page_num'])
    # Collect one row per text element: name plus bounding-box coordinates.
    for pq in pq_items:
        page_pq = next(pq.iterancestors('LTPage'))  # Use just the first ancestor
        page_num = page_pq.layout.pageid
        cur_str_item = str(pq.layout)
        tmp_items = pd.DataFrame([[
            get_name(cur_str_item),
            float(get_coordinates(cur_str_item)[0]),
            float(get_coordinates(cur_str_item)[2]),
            float(get_coordinates(cur_str_item)[1]),
            float(get_coordinates(cur_str_item)[3])
        ]], columns=['name', 'x0', 'x1', 'y0', 'y1'])
        # tmp_items['height'] = tmp_items['y1'] - tmp_items['y0']
        # tmp_items['width'] = tmp_items['x1'] - tmp_items['x0']
        tmp_items['height'] = get_diff3(tmp_items['y1'], tmp_items['y0'])
        tmp_items['width'] = get_diff3(tmp_items['x1'], tmp_items['x0'])
        tmp_items['page_num'] = page_num
        # NOTE(review): DataFrame.append is deprecated in modern pandas;
        # pd.concat is the replacement.
        items = items.append(tmp_items, ignore_index=True)
    # PDF converted to DF
    items = items.sort_values(['page_num', 'x0', 'y1'],
                              ascending=[True, True, False])
    items.reset_index(inplace=True, drop=True)
    # Height distribution: the most common large height is assumed to be the
    # category font size (cat_h); the next one down is the item size (item_h).
    heights = pd.crosstab(index=items["height"], columns="count")
    heights = heights[heights['count'] > 1]
    cat_h = round3(max(heights[heights['count'] >= min_dish_count].index.values))
    tmp = heights[heights['count'] >= min_dish_count].index.values
    item_h = round3(max(tmp[tmp < cat_h]))
    # Plot all boxes
    pdf_boundary_boxes(
        df=items,
        path_input='UPLOAD_FOLDER/pdf_temp.pdf',
        path_output='UPLOAD_FOLDER/temp.pdf',
        r=50, g=0, b=100)
    ######################## Get categories ####################################
    # Category candidates: boxes whose height is within 1% of cat_h.
    cat_list = items[items['height'].between(0.99 * cat_h, 1.01 * cat_h)]
    # Mean character width statistics of category boxes.
    cat_char_w = cat_list.apply(lambda row: mean_char(row['width'], row['name']),
                                axis=1).median()
    cat_char_w_max = cat_list.apply(lambda row: mean_char(row['width'], row['name']),
                                    axis=1).max()
    # Collapse rows with cat
    cat_list = collapse_rows(cat_list, sense=1.03)
    cat_list = cat_list.sort_values(['page_num', 'y1', 'x0'],
                                    ascending=[True, False, True])
    # Drop whitespace-only entries.
    filter = cat_list["name"] != ' '
    cat_list = cat_list[filter]
    cat_list = cat_list.reset_index(drop=True)
    # Draw categories boxes
    pdf_boundary_boxes(df=cat_list,
                       path_input='UPLOAD_FOLDER/pdf_temp.pdf',
                       show_height=False,
                       show_number=True,
                       path_output='UPLOAD_FOLDER/temp1.pdf')
    #################### Get items ###############################################
    # Item candidates: boxes whose height is within 1% of item_h.
    items_list = items[items['height'].between(0.99 * item_h, 1.01 * item_h)]
    items_list = items_list.reset_index(drop=True)
    items_list = collapse_rows(items_list)
    # Delete empty items
    filter = items_list["name"] != ' '
    items_list = items_list[filter]
    items_list = items_list.reset_index(drop=True)
    # Get dishes: anything that is NOT purely digits/dots/slashes.
    patternDel = "^[0-9 \. \/]+$"
    filter = items_list['name'].str.contains(patternDel)
    dishes_list = items_list[~filter]
    dishes_list = dishes_list.reset_index(drop=True)
    # Dishes to layout
    pdf_boundary_boxes(
        df=dishes_list,
        path_input="UPLOAD_FOLDER/temp1.pdf",
        path_output="UPLOAD_FOLDER/temp_dishes.pdf",
        show_height=False,
        r=0, g=0, b=230)
    # Get prices: item boxes that are not dishes.
    prices_list = items_list[~items_list.name.isin(dishes_list.name)]
    prices_list = prices_list.reset_index(drop=True)
    # Prices to layout
    pdf_boundary_boxes(
        df=prices_list,
        path_input="UPLOAD_FOLDER/temp_dishes.pdf",
        path_output="UPLOAD_FOLDER/temp_dishes_prices.pdf",
        show_height=False,
        r=230, g=0, b=0)
    ################################# Second algo ###################################
    # Re-parse the PDF with pdfminer directly to refine category names.
    # NOTE(review): fp is never closed in this function.
    fp = open('UPLOAD_FOLDER/pdf_temp.pdf', 'rb')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp)
    # Show new structure
    for page in pages:
        print('Processing next page...')
        interpreter.process_page(page)
        layout = device.get_result()
        for lobj in layout:
            if isinstance(lobj, LTTextBox):
                # NOTE(review): x, y, text are overwritten each iteration and
                # never used; after the loop only the LAST page's layout is kept.
                x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
    # Get cat-s (Only for 1 page)
    cat_n = pd.DataFrame(columns=['name', 'x0', 'x1', 'y0', 'y1', 'height', 'width', 'page_num'])
    for lobj in layout:
        if isinstance(lobj, LTTextBox):
            # First line of the text box, with its bounding box.
            x0, y1, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text().split("\n")[0]
            x1, y0 = lobj.bbox[2], lobj.bbox[1]
            # Match pdfminer boxes against pdfquery categories by y1 (±3%).
            tmp = cat_list[cat_list['y1'].between(0.97 * y1, 1.03 * y1)].copy()
            tmp['name'] = text
            if len(cat_list[cat_list['y1'].between(0.97 * y1, 1.03 * y1)]['name']) > 0:
                # Only keep entries whose text differs from the first match.
                if (text != cat_list[cat_list['y1'].between(0.97 * y1, 1.03 * y1)]['name'].values[0]):
                    tmp['x0'] = x0
                    cat_n = cat_n.append(tmp, ignore_index=True)
    # Re-draw new layout with cat-s
    pdf_boundary_boxes(
        df=cat_n,
        show_height=False,
        show_number=True,
        path_input="UPLOAD_FOLDER/temp_dishes_prices.pdf",
        path_output="UPLOAD_FOLDER/temp_cat_n.pdf",
    )
    # Get prices laid out vertically.
    pq_items1 = pdf.pq('LTTextLineVertical')
    items1 = pd.DataFrame(
        columns=['name', 'x0', 'x1', 'y0', 'y1', 'height', 'width', 'page_num'])
    for pq in pq_items1:
        page_pq = next(pq.iterancestors('LTPage'))  # Use just the first ancestor
        page_num = page_pq.layout.pageid
        cur_str_item = str(pq.layout)
        tmp_items = pd.DataFrame([[
            get_name(cur_str_item),
            float(get_coordinates(cur_str_item)[0]),
            float(get_coordinates(cur_str_item)[2]),
            float(get_coordinates(cur_str_item)[1]),
            float(get_coordinates(cur_str_item)[3])
        ]], columns=['name', 'x0', 'x1', 'y0', 'y1'])
        # tmp_items['height'] = tmp_items['y1'] - tmp_items['y0']
        # tmp_items['width'] = tmp_items['x1'] - tmp_items['x0']
        tmp_items['height'] = get_diff3(tmp_items['y1'], tmp_items['y0'])
        tmp_items['width'] = get_diff3(tmp_items['x1'], tmp_items['x0'])
        tmp_items['page_num'] = page_num
        items1 = items1.append(tmp_items, ignore_index=True)
    items1 = items1.sort_values(['page_num', 'x0', 'y1'],
                                ascending=[True, True, False])
    items1.reset_index(inplace=True, drop=True)
    # Keep only digit-run entries (vertical price columns).
    patternDel = '^ *\d[\d ]*$'
    filter = items1['name'].str.contains(patternDel)
    items1 = items1[filter]
    items1 = items1.reset_index(drop=True)
    # Split each vertical digit run into per-character price boxes, dividing
    # the run's height evenly among its characters.
    prices_n = pd.DataFrame(columns=['name', 'x0', 'x1', 'y0', 'y1'])
    for i in range(0, len(items1)):
        big_prices = items1.iloc[i]['name']
        height_a = items1.iloc[i]['height'] / len(big_prices)
        tmp_len = len(big_prices)
        for j in range(0, tmp_len):
            # tmp_prices_n = pd.DataFrame(columns=['name', 'x0', 'x1', 'y0', 'y1', 'height', 'width', 'page_num'])
            tmp_name = items1.iloc[i]['name'][j]
            y1_temp = items1.iloc[i]['y1'] - j * height_a
            y0_temp = items1.iloc[i]['y0'] + j * height_a
            x0, x1 = items1.iloc[i]['x0'], items1.iloc[i]['x1']
            tmp_prices_n = pd.DataFrame({
                'name': [tmp_name],
                'x0': x0,
                'x1': x1,
                'y0': y0_temp,
                'y1': y1_temp
            }, index=[0])
            prices_n = prices_n.append(tmp_prices_n, ignore_index=True)
    prices_n = prices_n.sort_values(['x0', 'y1'], ascending=[True, False])
    prices_n.reset_index(inplace=True, drop=True)
    # Draw new layout
    pdf_boundary_boxes(
        df=prices_n,
        path_input="UPLOAD_FOLDER/temp_cat_n.pdf",
        path_output="UPLOAD_FOLDER/temp_dishes_prices_n.pdf",
        show_height=False,
        r=230, g=0, b=0)
    # return 'Done'
    return send_from_directory(upload_path, 'temp_dishes_prices_n.pdf')
    def load_file_text(self, import_file):
        """ Import from file types of odt, docx pdf, epub, txt, html, htm.

        Extracts plain text from the file, then inserts it as a new entry in
        the 'source' table and in self.source, warning (via QMessageBox) on
        duplicates or failures.
        """
        text = ""

        # Import from odt
        if import_file[-4:].lower() == ".odt":
            text = self.convert_odt_to_text(import_file)
        # Import from docx
        if import_file[-5:].lower() == ".docx":
            #text = convert(importFile)  # uses docx_to_html
            document = opendocx(import_file)
            list_ = getdocumenttext(document)
            text = "\n".join(list_)
        # Import from epub
        if import_file[-5:].lower() == ".epub":
            book = epub.read_epub(import_file)
            for d in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
                #print(d.get_content())
                bytes_ = d.get_body_content()
                string = bytes_.decode('utf-8')
                text += html_to_text(string) + "\n"
        # import PDF
        if import_file[-4:].lower() == '.pdf':
            # NOTE(review): fp is never closed in this branch.
            fp = open(import_file, 'rb')  # read binary mode
            parser = PDFParser(fp)
            doc = PDFDocument(parser=parser)
            parser.set_document(doc)  # potential error with encrypted PDF
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 1.0
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                layout = device.get_result()
                # Keep only the text-carrying layout objects.
                for lt_obj in layout:
                    if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                        text += lt_obj.get_text()
        # import from html
        if import_file[-5:].lower() == ".html" or import_file[-4:].lower() == ".htm":
            # NOTE(review): importErrors is never incremented in this branch,
            # so the warning below always reports 0 (and uses camelCase while
            # the plain-text branch uses import_errors).
            importErrors = 0
            with open(import_file, "r") as sourcefile:
                fileText = ""
                while 1:
                    line = sourcefile.readline()
                    if not line:
                        break
                    fileText += line
                text = html_to_text(fileText)
            QtWidgets.QMessageBox.warning(None, _('Warning'), str(importErrors) + _(" lines not imported"))
        # Try importing as a plain text file.
        if text == "":
            import_errors = 0
            try:
                with open(import_file, "r") as sourcefile:
                    while 1:
                        line = sourcefile.readline()
                        if not line:
                            break
                        try:
                            text += line
                        except Exception as e:
                            #logger.debug("Importing plain text file, line ignored: " + str(e))
                            import_errors += 1
                    # NOTE(review): "\ufeff" is a single character, so
                    # text[0:6] (six characters) can only equal it for
                    # degenerate input, and text[6:] then drops six
                    # characters — BOM stripping is likely broken here.
                    if text[0:6] == "\ufeff":  # associated with notepad files
                        text = text[6:]
            except Exception as e:
                QtWidgets.QMessageBox.warning(None, _('Warning'),
                    _("Cannot import ") + str(import_file) + "\n" + str(e))
                return
            if import_errors > 0:
                QtWidgets.QMessageBox.warning(None, _('Warning'),
                    str(import_errors) + _(" lines not imported"))
                logger.warning(import_file + ": " + str(import_errors) + _(" lines not imported"))
        # import of text file did not work
        if text == "":
            # NOTE(review): `e` may be undefined here (NameError) when no
            # exception occurred above — Python 3 unbinds the except target.
            QtWidgets.QMessageBox.warning(None, _('Warning'),
                _("Cannot import ") + str(import_file) + "\n" + str(e))
            return
        # Final checks: check for duplicated filename and update model, widget and database
        nameSplit = import_file.split("/")
        filename = nameSplit[-1]
        if any(d['name'] == filename for d in self.source):
            QtWidgets.QMessageBox.warning(None, _('Duplicate file'),
                _("Duplicate filename.\nFile not imported"))
            return
        entry = {'name': filename, 'id': -1, 'fulltext': text, 'mediapath': None,
            'memo': "", 'owner': self.settings['codername'],
            'date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
        cur = self.settings['conn'].cursor()
        #logger.debug("type fulltext: " + str(type(entry['fulltext'])))
        cur.execute("insert into source(name,fulltext,mediapath,memo,owner,date) values(?,?,?,?,?,?)",
            (entry['name'], entry['fulltext'], entry['mediapath'], entry['memo'],
            entry['owner'], entry['date']))
        self.settings['conn'].commit()
        cur.execute("select last_insert_rowid()")
        id_ = cur.fetchone()[0]
        entry['id'] = id_
        self.parent_textEdit.append(entry['name'] + _(" imported."))
        self.source.append(entry)
def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    :param pdf_path: path to PDF file to be extracted (remote or local) —
        either a filesystem path or an io.BytesIO object
    :return: iterator of string of extracted text (one item per page)
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/

    def _pages_text(fh):
        # Shared per-page extraction; the local and remote paths used to be
        # byte-for-byte duplicates of this loop.
        try:
            for page in PDFPage.get_pages(
                    fh,
                    caching=True,
                    check_extractable=True
            ):
                resource_manager = PDFResourceManager()
                fake_file_handle = io.StringIO()
                converter = TextConverter(
                    resource_manager,
                    fake_file_handle,
                    codec='utf-8',
                    laparams=LAParams()
                )
                page_interpreter = PDFPageInterpreter(
                    resource_manager,
                    converter
                )
                page_interpreter.process_page(page)
                yield fake_file_handle.getvalue()
                # close open handles
                converter.close()
                fake_file_handle.close()
        except PDFSyntaxError:
            # Malformed PDF: stop yielding, matching the original behavior.
            return

    if not isinstance(pdf_path, io.BytesIO):
        # extract text from local pdf file
        with open(pdf_path, 'rb') as fh:
            yield from _pages_text(fh)
    else:
        # extract text from remote pdf file (already an in-memory stream)
        yield from _pages_text(pdf_path)
def pdf2txt(self): ''' ============================= return : str, text File path ''' # input password = '' pagenos = set() maxpages = 0 # output imagewriter = None rotation = 0 codec = 'UTF-8' pageno = 1 scale = 1 caching = True showpageno = True laparams = LAParams() infp = open(self.input_path, "rb") if self.output_path == None: self.output_path = self.input_path[:-4] + '_trans.txt' outfp = open(self.output_path, "w", encoding='UTF8') else: outfp = open(self.output_path, "w", encoding='UTF8') #page total num parser = PDFParser(infp) document = PDFDocument(parser) page_total_num = resolve1(document.catalog['Pages'])['Count'] # rsrcmgr = PDFResourceManager(caching=caching) # pdf -> text converter device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter) # pdf -> text interpreter interpreter = PDFPageInterpreter(rsrcmgr, device) # pdf -> text start with tqdm(total=page_total_num) as pbar: for page in PDFPage.get_pages(infp, pagenos, maxpages, password=password, caching=caching, check_extractable=True): page.rotate = (page.rotate + rotation) % 360 interpreter.process_page(page) pbar.update(1) print('[INFO] pdf -> text') outfp.close() infp.close() return self.output_path
# Layout Analysisのパラメーターを設定。縦書きの検出を有効にする。 laparams = LAParams(detect_vertical=False, word_margin=0.5, line_margin=.1, boxes_flow=0) # 共有のリソースを管理するリソースマネージャーを作成。 resource_manager = PDFResourceManager() # ページを集めるPageAggregatorオブジェクトを作成。 device = PDFPageAggregator(resource_manager, laparams=laparams) # Interpreterオブジェクトを作成。 interpreter = PDFPageInterpreter(resource_manager, device) # 出力用のテキストファイル output_txt = open('output.txt', 'w') def print_and_write(txt): print(txt) output_txt.write(txt) output_txt.write('\n') with open(sys.argv[1], 'rb') as f: # PDFPage.get_pages()にファイルオブジェクトを指定して、PDFPageオブジェクトを順に取得する。 # 時間がかかるファイルは、キーワード引数pagenosで処理するページ番号(0始まり)のリストを指定するとよい。 for page in PDFPage.get_pages(f):
def parse(_path):
    """Extract domestic/international freight volume figures from a CAAC
    statistics PDF and append them to 'pdf_val.txt'.

    Raises PDFTextExtractionNotAllowed when the document forbids text
    extraction, and propagates pdfminer errors for encrypted documents.
    """
    # fix: the PDF handle used to be leaked; 'with' guarantees it is closed.
    with open(_path, 'rb') as fp:  # binary read mode
        # Wire up the (legacy-API) pdfminer parser/document pair.
        praser_pdf = PDFParser(fp)
        doc = PDFDocument()
        praser_pdf.set_document(doc)
        doc.set_parser(praser_pdf)
        # Initialize with an empty password.
        doc.initialize()
        # Abort if the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process the document page by page.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # layout is an LTPage holding LTTextBox, LTFigure, LTImage, ...
            layout = device.get_result()
            for out in layout:
                # Only horizontal text boxes carry the figures we need.
                if isinstance(out, LTTextBoxHorizontal):
                    results = out.get_text()
                    # print("results: " + results)
                    with open(r'pdf_val.txt', 'a') as f:
                        if "运输完成情况" in results:
                            # Lines 10 and 12 of this box hold the two figures.
                            target_value = results.split("\n")
                            inland_amount = target_value[10]
                            foreign_amount = target_value[12]
                            print("国内货邮运输量:", inland_amount,
                                  "国际货邮运输量:", foreign_amount)
                            f.write("国内货邮运输量:" + inland_amount +
                                    ",国际货邮运输量:" + foreign_amount + "\n")
                            # fix: dropped redundant f.close() inside 'with'.
                            break
def pdftotexts(filename):
    """Extract all text from *filename* (a PDF) and write it to
    '<filename minus .pdf>_text_generatedbymain.py.txt'."""
    path_to_pdf = filename  # Load your PDF
    # PDFResourceManager stores shared resources such as fonts or images
    # that we might encounter in the files.
    resource_manager = PDFResourceManager(caching=True)
    # String buffer that will contain the final text representation of the pdf.
    out_text = StringIO()
    # LAParams holds the layout-analysis parameters (defaults used here).
    laParams = LAParams()
    # TextConverter renders pages into out_text.
    text_converter = TextConverter(resource_manager, out_text, laparams=laParams)
    # fix: the PDF handle used to leak on exceptions; 'with' closes it.
    with open(path_to_pdf, 'rb') as fp:
        interpreter = PDFPageInterpreter(resource_manager, text_converter)
        # Process the content of each page of the original PDF file.
        for page in PDFPage.get_pages(fp,
                                      pagenos=set(),
                                      maxpages=0,
                                      password="",
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
    # Retrieve the buffered text before closing the StringIO object.
    text = out_text.getvalue()
    # Close the remaining resources we opened.
    text_converter.close()
    out_text.close()
    # fix: write with explicit UTF-8 so the output does not depend on the
    # platform's default encoding (could raise UnicodeEncodeError on Windows).
    with open(str(filename)[:-4] + "_text_generatedbymain.py.txt", 'w',
              encoding='utf-8') as f:
        f.write(text)
def parsePDF(fileLocation, politician, party, topic, useEmptyDB):
    """Parse a ritzau PDF export, extracting quotes by *politician*.

    Quotes and articles are appended to the CSV datasets under ``../out``
    (or re-built from the empty templates when *useEmptyDB* is true).

    :param fileLocation: path of the PDF export to parse.
    :param politician: full name of the politician of interest.
    :param party: party label stored with every quote.
    :param topic: topic label stored with quotes and articles.
    :param useEmptyDB: start from the empty dataset templates.
    """
    data = ''
    # Indications of fillers around quotes, and of non-article text.
    # (The original left both handles open.)
    with open('../Resources/quoteRelatedFillerWords.txt', 'r',
              encoding='utf-8') as f:
        quoteRelatedFillers = f.readline().split(',')
    with open('../Resources/nonArticleFlags.txt', 'r', encoding='utf-8') as f:
        nonArticleFlags = f.readline().split(',')
    quoteFillers = {'-', '»', '«'}
    wrongQuoteFlags = set()
    correctQuoteFlags = set()
    upcomingCorrectQuoteFlags = set()
    politicianLastName = politician.split(' ')[-1]
    # Generate quote fillers to be extracted, flags indicating a quote is by
    # another than the politician of interest, that the quote is by the
    # politician of interest or that an upcoming quote is of interest,
    # pairing quote fillers, pronouns and the name of the politician.
    for filler in quoteRelatedFillers:
        quoteFillers.update([
            ', ' + filler + '.*' + politician + '.*',
            ', ' + filler + ' hun.*', ', ' + filler + ' han.*',
            ', ' + filler + '.*' + politicianLastName + '.*'
        ])
        # Statement made by someone other than the given politician.
        wrongQuoteFlags.update([
            ', ' + filler + ' (?!.*' + politician + '|.*hun|.*han|.*' +
            politicianLastName + ').*'
        ])
        correctQuoteFlags.update([
            ', ' + filler + ' .*' + politician + '.*',
            ', ' + filler + ' .*' + politicianLastName + '.*'
        ])
        upcomingCorrectQuoteFlags.update([
            filler + '[ |,].*' + politician,
            politician + '[ |,].*' + filler,
            filler + '[ |,].*' + politicianLastName,
            politicianLastName + '[ |,].*' + filler
        ])
    # Extract the raw text of the whole PDF; handles are closed even on error.
    fp = open(fileLocation, 'rb')
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
        data = retstr.getvalue()
    finally:
        device.close()
        fp.close()
    # Removing null bytes, generated by "ft", "tf" and "ff".
    data = data.replace('\0', '')
    newArticle = True
    quotes = []
    articleTitle, articleText, date = '', '', ''
    quoteCount, articleCount, articleID = 0, 0, 0
    # Use empty dataset files
    if useEmptyDB:
        quoteDB = pd.read_csv('../out/empty_db/quote_db.csv', sep=';',
                              encoding='UTF-8', header=0)
        articleDB = pd.read_csv('../out/empty_db/article_db.csv', sep=';',
                                encoding='UTF-8', header=0)
    # Open already present dataset files
    else:
        quoteDB = pd.read_csv('../out/quote_db.csv', sep=';',
                              encoding='UTF-8', header=0)
        articleDB = pd.read_csv('../out/article_db.csv', sep=';',
                                encoding='UTF-8', header=0)
    # Continue from largest article and quote id, if the datasets are
    # already populated (max() is NaN on an empty column).
    maxArticleID = articleDB['articleID'].max()
    articleID = maxArticleID + 1 if not math.isnan(maxArticleID) else 1
    maxQuoteID = quoteDB['quoteID'].max()
    quoteID = maxQuoteID + 1 if not math.isnan(maxQuoteID) else 1
    quoteDicts = []
    articleDicts = []
    # Flags indicating special cases in which quotes might appear.
    correctUpcomingQuote = False
    quotesInALine = False
    # Isolate paragraphs.
    for paragraph in data.split('\n\n'):
        paragraph = paragraph.replace('\n', ' ')
        # Save article publication dates.
        if 'Id:' in paragraph:
            date = re.compile('\\w* \\d{2}, \\d{4}').search(paragraph).group()
            strpDate = datetime.strptime(date, '%B %d, %Y')
            date = strpDate.strftime('%m/%d/%Y')
        # Skip non-article text.
        if any(flag in paragraph for flag in nonArticleFlags) \
                or re.search('\\d+\\W\\d+\\W\\d{4}', paragraph) \
                or re.search('^\\d+/\\d+$', paragraph) \
                or paragraph == 'København':
            continue
        # Identify and extract quotes.
        if paragraph.startswith('- ') or paragraph.startswith('»'):
            paragraph = paragraph.replace('«', '')
            # Ignore quote if not from politician in question.
            wrongQuote = False
            correctQuote = False
            for wrongQuoteFlag in wrongQuoteFlags:
                if re.search(wrongQuoteFlag, paragraph):
                    wrongQuote = True
                    # Reset flag to avoid false positive indication of a
                    # quote of interest.
                    quotesInALine = False
            if not wrongQuote:
                # Identify whether quote is of interest.
                for correctQuoteFlag in correctQuoteFlags:
                    if re.search(correctQuoteFlag, paragraph):
                        correctQuote = True
                # Remove 'fillers' around quotes, such as
                # ', siger Martin Henriksen', and strip whitespace.
                if correctQuote or correctUpcomingQuote or quotesInALine:
                    quotesInALine = True
                    for quoteFiller in quoteFillers:
                        paragraph = re.sub(quoteFiller, '', paragraph)
                    paragraph = paragraph.strip()
                    quotes.append(paragraph)
        # Catch multiple quotes in a row, in question-answer chain.
        elif not paragraph.startswith('Spørgsmål: '):
            quotesInALine = False
            correctUpcomingQuote = False
            # Check if paragraph insinuates an upcoming quote of interest.
            for upcomingCorrectQuoteFlag in upcomingCorrectQuoteFlags:
                if re.search(upcomingCorrectQuoteFlag, paragraph):
                    correctUpcomingQuote = True
            if re.search(politician + '.*:$', paragraph.strip()):
                correctUpcomingQuote = True
            # Identify the title of the article.
            if newArticle and not re.search('\\d+\\W\\d+\\W\\d{4}', paragraph):
                articleTitle = paragraph
                newArticle = False
                continue
            # Construct article string from paragraphs, excluding
            # non-article paragraphs generated during PDF extraction.
            articleText += paragraph
            # End of article indicated with 'The client may distribute'.
            if paragraph.startswith('The client may distribute'):
                # Save quotes with info in quote dataset, incrementing
                # quote ID and quote count.
                for quote in quotes:
                    quoteDicts.append({
                        'quoteID': quoteID,
                        'quote': quote,
                        'politician': politician,
                        'date': date,
                        'party': party,
                        'articleID': str(articleID),
                        'topic': topic,
                        'fan': '',
                        'articleText': articleText
                    })
                    quoteCount += 1
                    quoteID += 1
                # Save article with articleID in article dataset, and
                # increment article ID and article count.
                articleDicts.append({
                    'articleID': articleID,
                    'topic': topic,
                    'articleTitle': articleTitle,
                    'articleText': articleText,
                    'mediaOutlet': 'ritzau'
                })
                articleID += 1
                articleCount += 1
                # Reset article text and quotes in article, and indicate
                # the start of a new article.
                newArticle = True
                quotes.clear()
                articleText = ''
    # Append newly parsed quotes and articles to the existing datasets.
    # DataFrame.append() was removed in pandas 2.0; pd.concat is the
    # supported equivalent.
    quoteDB = pd.concat([quoteDB, pd.DataFrame(quoteDicts)], sort=False)
    articleDB = pd.concat([articleDB, pd.DataFrame(articleDicts)], sort=False)
    # Remove quote and article duplicates.
    quoteDB.drop_duplicates(subset=['quote', 'politician'], inplace=True)
    articleDB.drop_duplicates(subset=['articleText'], inplace=True)
    print('Quotes for politician:', quoteCount,
          '\nArticles for politician:', articleCount,
          '\nTotal quotes:', len(quoteDB.index),
          '\nTotal articles:', len(articleDB.index))
    # Save updated databases (quoting=1 == csv.QUOTE_ALL).
    quoteDB.to_csv('../out/quote_db.csv', sep=';', encoding='UTF-8',
                   index=False, quoting=1)
    articleDB.to_csv('../out/article_db.csv', sep=';', encoding='UTF-8',
                     index=False, quoting=1)
def exportPDF(self, infile, outfile):
    """Render the PDF *infile* as plain text into *outfile*.

    Falls back to stdout when *outfile* is empty/None. stdout is never
    closed; an explicitly opened output file always is.

    :param infile: path of the PDF to convert.
    :param outfile: path of the text output, or a falsy value for stdout.
    """
    # Fixed conversion options (the command-line option parsing that used
    # to configure these was dead, commented-out code).
    password = ''
    pagenos = set()
    maxpages = 0
    imagewriter = None
    rotation = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    rsrcmgr = PDFResourceManager(caching=caching)
    # BUG FIX: the original called the Python 2-only builtin `file()`,
    # which raises NameError on Python 3; use open() instead.
    if outfile:
        outfp = open(outfile, 'w')
        close_out = True
    else:
        outfp = sys.stdout
        close_out = False  # never close the interpreter's stdout
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    try:
        with open(infile, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                # Apply the (currently zero) extra rotation per page.
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()
        if close_out:
            outfp.close()
    return
def flight_plan_reader(self):
    """Populate ``self.flight_plan_dic`` with parameters parsed from each
    flight-plan PDF.

    For every entry, the PDF at the last position of the entry's value list
    is converted to text; header parameters are sliced out of the last page
    by the character offsets of known header labels, and the entry's value
    is replaced by a dict of those parameters.
    """
    # Header labels expected on the plan's summary (last) page, in the
    # order they appear; slicing below relies on this ordering.
    plan_headers = [
        'Strip width', 'Lateral overlap', 'Run spacing', 'Forward overlap',
        'Photo base', 'Total length', 'Total lines', 'Total photos',
        'Planned By', 'Survey Time'
    ]
    for key in self.flight_plan_dic.keys():
        flight_plan_name = key
        # The PDF path is stored at the end of the entry's value list.
        file_path = self.flight_plan_dic[key][-1]
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(file_path, 'rb') as file:
            for page in PDFPage.get_pages(file,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()
        converter.close()
        fake_file_handle.close()
        # The watermark string separates the pages in the extracted text.
        page_split_output = text.split(
            ' UNCONTROLLED DOCUMENT WHEN PRINTED ')
        first_page = page_split_output[0]
        script = None
        params_index = OrderedDict()
        param_values = {}
        backup_sys = None
        if 'VQ' in self.flight_plan_dic[key][0]:
            param_values['System'] = 'VQ'
            backup_sys = 'VQ'
            # Script name lies between the first and last 'LiDAR' markers.
            script = first_page[first_page.find('LiDAR'):first_page.
                                rfind('LiDAR')]
            if len(script) == 0:
                # Fallback layout: 'Lidar:' ... 'Images:Collect images'.
                try:
                    script = first_page[
                        first_page.find('Lidar:'):first_page.
                        rfind('Images:Collect images')].split(':')[1]
                except IndexError:
                    script = None
        elif 'H68' in self.flight_plan_dic[key][0]:
            script = 'H68 Standard'
            param_values['System'] = 'Harrier'
            backup_sys = 'H68'
        if script is None:
            print(
                "You will need to enter flight plan info manually; non-standard setup found."
            )
        else:
            last_page = page_split_output[-1]
            # Record the character offset of each header on the last page
            # (str.find returns -1 when a header is absent).
            for param in plan_headers:
                params_index[param] = last_page.find(param)
            """ if last_page.find(param) == -1:
                    params_index[param] = None"""
            for i, param in enumerate(plan_headers):
                param_value = None
                if i < (len(plan_headers) - 1):
                    # Value is the text between this header and the next,
                    # after the last ':'.
                    next_param = plan_headers[i + 1]
                    index_current = params_index[param]
                    index_next = params_index[next_param]
                    cut_up_string = last_page[index_current:index_next]
                    param_value = cut_up_string.split(':')[-1].strip(' ')
                elif i == (len(plan_headers) - 1):
                    # Last header: value runs until the first '['.
                    index_current = params_index[param]
                    end = last_page.find('[')
                    cut_up_string = last_page[index_current:end]
                    param_value = cut_up_string.split(':')[-1].strip(' ')
                param_values[param] = param_value
        param_values['script'] = script
        param_values['initial_vals'] = self.flight_plan_dic[
            flight_plan_name]
        # NOTE(review): this overwrites the 'Harrier' value set above with
        # 'H68' (and 'VQ' with the identical 'VQ') — confirm which system
        # label downstream consumers expect.
        param_values['System'] = backup_sys
        self.flight_plan_dic[flight_plan_name] = param_values
def convert_pdf_to_text(pdf_path: Union[object, str], docketnum: str) -> str:
    """Takes path (or pathlib Path object) to a PDF file and a docketnum;
    returns the text inside the PDF and writes it to the extracted-text
    cache file for the docket.

    Returns an empty string when the PDF cannot be parsed.
    """
    # SET PATHS
    extracted_text_path = extracted_text_path_gen(dirs["extracted_text"],
                                                  docketnum)
    logging.info(f"Converting pdf to text for docket {docketnum}...")
    password = ""
    extracted_text = ""
    # Open and read the pdf file in binary mode. The `with` ensures the
    # handle is closed on every path — the original leaked it on the
    # parse-failure early return and on the not-extractable raise.
    with open(pdf_path, "rb") as fp:
        # Create parser object to parse the pdf content
        parser = PDFParser(fp)
        # Store the parsed content in PDFDocument object
        try:
            document = PDFDocument(parser, password)
        except Exception as e:
            logging.error("Something went wrong during conversion")
            logging.exception(e)
            logging.info(f"Returning no extracted text for docket {docketnum}")
            return extracted_text
        # Check if document is extractable, if not abort
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager stores shared resources such as fonts or images
        rsrcmgr = PDFResourceManager()
        # set parameters for layout analysis
        laparams = LAParams()
        # Aggregator renders interpreted pages into layout objects
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process the document page by page
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Only LTTextBox and LTTextLine objects carry text
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    extracted_text += lt_obj.get_text()
    with open(extracted_text_path, "wb") as fout:
        fout.write(extracted_text.encode("utf-8"))
    logging.info("Text extracted successfully")
    return extracted_text
def pdf_to_txt_miner(folder, password):
    """Convert every ``.pdf`` file in *folder* to a ``.txt`` file.

    Output files are written under the module-level ``txtpath`` directory,
    one per PDF, named after the PDF with a ``.txt`` suffix.

    :param folder: directory containing the PDF files.
    :param password: unused; documents are opened with an empty password.
    """
    # Collect the PDF files in the folder.
    files = os.listdir(folder)
    pdfFiles = [f for f in files if f.endswith('.pdf')]
    for pdfFile in pdfFiles:
        print(pdfFile)
        pdfPath = os.path.join(folder, pdfFile)
        # Target text path; append .txt if the name lacks the suffix.
        wdPath = os.path.join(txtpath, pdfFile)
        if wdPath[-4:] != '.txt':
            wdPath = wdPath + '.txt'
        # BUG FIX: the original opened `path + "/" + pdfFile` via a stray
        # global instead of the pdfPath computed above, and never closed
        # the handle.
        with open(pdfPath, 'rb') as fn:
            parser = PDFParser(fn)
            doc = PDFDocument()
            # Link parser and document to each other.
            parser.set_document(doc)
            doc.set_parser(parser)
            # Empty password for unencrypted documents.
            doc.initialize('')
            # Skip documents that forbid text extraction.
            if not doc.is_extractable:
                print('PDFTextExtractionNotAllowed')
                continue
            resource = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(resource, laparams=laparams)
            interpreter = PDFPageInterpreter(resource, device)
            num_page, num_image, num_Text = 0, 0, 0
            for page in doc.get_pages():
                num_page += 1
                interpreter.process_page(page)
                layout = device.get_result()
                # Open the output once per page instead of once per box.
                with open(wdPath, 'a', encoding='utf-8') as out_file:
                    for out in layout:
                        if isinstance(out, LTTextBoxHorizontal):
                            num_Text += 1
                            out_file.write(out.get_text() + '\n')
                        elif isinstance(out, LTImage):
                            # BUG FIX: LTImage has no get_text(); the
                            # original write raised AttributeError. Images
                            # are only counted now.
                            num_image += 1
#maxpages = 0 manager = PDFResourceManager(caching=caching) if case == 'txt': output = io.StringIO() converter = TextConverter(manager, output, codec=codec, laparams=laparams) if case == 'HTML': output = io.BytesIO() converter = HTMLConverter(manager, output, codec=codec, laparams=laparams) interpreter = PDFPageInterpreter(manager, converter) infile = open(fname, 'rb') for index, page in enumerate( PDFPage.get_pages(infile, pagenums, caching=caching, check_extractable=True)): interpreter.process_page(page) convertedPDF = output.getvalue() infile.close() converter.close() output.close()
def pdfread(pdfPath):
    """Extract the body text of a paper-style PDF.

    Citation markers, headers/footers, page numbers, copyright lines and
    very short fragments are dropped; extraction stops at the references
    section. Returns the accumulated text, or None when reading fails.

    :param pdfPath: path of the PDF file to read.
    """
    with open(pdfPath, 'rb') as fp:
        try:
            print(pdfPath)
            # Build the parser/document pair and link them together.
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            # No password by default.
            doc.initialize()
            # Abort on documents that forbid text extraction (caught by the
            # outer except and reported like any other failure).
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed
            rsrcmagr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmagr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmagr, device)
            allContent = ''
            last_para = ''
            result = ''
            for page in doc.get_pages():
                interpreter.process_page(page)
                # The LTPage layout holds the parsed objects of this page;
                # only LTTextBoxHorizontal objects carry text here.
                layout = device.get_result()
                for x in layout:
                    try:
                        if isinstance(x, LTTextBoxHorizontal):
                            result = x.get_text()
                            # Drop the line breaks pdfminer inserts.
                            result = result.replace('\n', '')
                            # Drop citation markers like [1] or [2-5]
                            # (raw strings avoid invalid-escape warnings).
                            result = re.sub(r'\[(\d+\,* ?-?)+\]', '', result)
                            # Put each bullet item on its own line.
                            result = result.replace('∙', '\n∙')
                            # Stop once the references section starts.
                            if re.findall(r'^references?',
                                          last_para.lower().replace(' ', '')) != [] \
                                    or re.findall(r'^references?',
                                                  result.lower().replace(' ', '')) != []:
                                return allContent
                            # Skip footers, copyright lines, timestamps,
                            # trailing citations and tiny table fragments.
                            if re.findall(r'^Authorized licensed use limited to:',
                                          result) == [] \
                                    and re.findall('©', result) == [] \
                                    and re.findall('Publication date', result) == [] \
                                    and re.findall(r'\d\:\d', result) == [] \
                                    and re.findall(r'(et al.)$', result) == [] \
                                    and len(result) > 5:
                                allContent = allContent + '\n' + result
                    except Exception as e:
                        print(e)
                    last_para = result
            return allContent
        except Exception as e:
            print('文档读取失败:' + str(e))
def convert_pdf_to_txt(self, path):
    """Extract text from the PDF at *path*, stopping once the expected
    number of signature blocks has been seen while scanning the trailing
    pages in reverse.

    Returns the accumulated text, or None if the document yields no pages.
    NOTE(review): this appears tailored to Greek government documents
    ('ΠΕΡΙΕΧΟΜΕΝΑ' = contents, 'Οι Υπουργοί' = the ministers) — confirm.
    """
    start = timer()
    codec = 'utf-8'
    rsrcmgr = PDFResourceManager()
    # All processed pages accumulate into this single buffer; it is never
    # reset, so each getvalue() below returns everything rendered so far.
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr,
                           retstr,
                           codec=codec,
                           laparams=self.laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    # Lazy page generator over the whole document.
    pages = PDFPage.get_pages(fp,
                              pagenos,
                              maxpages=maxpages,
                              password=password,
                              caching=caching,
                              check_extractable=True)
    # Analyze first page to get a feel of what's going on
    try:
        first_page = next(pages)
        interpreter.process_page(first_page)
    except StopIteration:
        # NOTE(review): fp and device are not closed on this early return.
        print("The pdf document may be damaged")
        return
    # Save pages to RAM to interpret only the last 3 ones
    temp_pages = []
    # Get the first page's text
    text = retstr.getvalue()
    num_signature_points = 1
    if 'ΠΕΡΙΕΧΟΜΕΝΑ' in text:
        # A table of contents implies multiple decisions, each with its own
        # signature block; count the numbered entries near the top.
        indexes = re.findall('[0-9] \n', text[120:350])
        num_signature_points = len(indexes)
    for page in pages:
        temp_pages.append(page)
    # Goes through the pages in reverse until it finds the stopword(s)
    signature_points_found = 0
    for page in reversed(temp_pages):
        interpreter.process_page(page)
        current_text = retstr.getvalue()
        # Signature markers: 'Οι Υπουργοί'/'ΟΙ ΥΠΟΥΡΓΟΙ' or a date match.
        if 'Οι Υπουργοί' in current_text or Helper.date_match().findall(current_text) \
                or 'ΟΙ ΥΠΟΥΡΓΟΙ' in current_text:
            signature_points_found += 1
            if signature_points_found == num_signature_points:
                break
    text = retstr.getvalue()
    fp.close()
    device.close()
    end = timer()
    print("{} seconds elapsed for parsing this pdf's text.".format(end -
                                                                   start))
    return text
def extract_introduction(proceedings: List[Dict[str, Any]]):
    """
    extract introductions from the proceedings papers

    Args:
        proceedings (List[Dict[str, Any]]):
            {'session': ●●●,
             'title': ●●●,
             'url': ●●●,
             'authors': [●●●, ...], }
            Each dict gains an 'introduction' key (empty string on failure).
    """
    # Section titles an introduction may be labelled with.
    chaps = ['はじめに', '序論', '背景', '背景と目的', 'Introduction']
    laparams = LAParams()
    laparams.detect_vertical = True
    for paper_dict in tqdm.tqdm(proceedings):
        manager = PDFResourceManager()
        paper_pdf = requests.get(paper_dict['url'])
        instr = BytesIO()
        instr.write(paper_pdf.content)
        outstr = StringIO()
        # BUG FIX: intro is initialized before the try block; previously a
        # failure before its first assignment raised NameError below.
        intro = ''
        with TextConverter(manager, outstr, laparams=laparams) as device:
            interpreter = PDFPageInterpreter(manager, device)
            try:
                # Only the first page is needed to locate the introduction.
                for page in PDFPage.get_pages(instr,
                                              set(),
                                              maxpages=1,
                                              caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
                first = outstr.getvalue()
                print(first)
                for chap in chaps:
                    # Numbered heading with a space, e.g. "1 はじめに".
                    cn = '1 {}'.format(chap)
                    if cn in first:
                        top = first.find(cn) + len(cn)
                        # Cut at the start of section 2 (ASCII or Japanese
                        # full stop). The original had an exact duplicate
                        # of the first branch, which was dead code.
                        if '.\n\n2' in first:
                            intro = first[top:first.find('.\n\n2')]
                        elif '。\n\n2' in first:
                            intro = first[top:first.find('。\n\n2')]
                        else:
                            intro = first[top:]
                        break
                    # Heading without a space, e.g. "1はじめに".
                    cn = '1{}'.format(chap)
                    if cn in first:
                        top = first.find(cn) + len(cn)
                        if '.2' in first:
                            intro = first[top:first.find('.2')]
                        elif '。2' in first:
                            intro = first[top:first.find('。2')]
                        else:
                            intro = first[top:]
                        break
            except Exception as e:
                logger.error('error: {} url:{}'.format(e.args,
                                                       paper_dict['url']))
        # Collapse repeated newlines.
        intro = re.sub(r'\n+', '\n', intro)
        paper_dict['introduction'] = intro
        # Throttle requests to the proceedings server.
        time.sleep(1.5 + random.random())
        instr.close()
        outstr.close()
def PDFReader(path, pages=None):
    """Read a PDF with pdfminer and pdfquery, collecting per-line and
    per-box text plus bounding-box coordinates.

    :param path: path of the PDF file.
    :param pages: optional iterable of page numbers to restrict extraction.
    :return: (status, PagePosDict, PagePosDict1, PagePosDictCord,
              all_pages, all_page_set, PagePosDictJuris, PagePosDictPages)
             where status is 1 on a clean stop (empty page) and 0 when
             pdfquery iteration failed (typically past the last page).
    """
    PagePosDict = []
    PagePosDict1 = []
    PagePosDictCord = defaultdict()
    PagePosDictPages = defaultdict()
    PagePosDictJuris = []
    all_pages = []
    all_pages_juris = []
    all_page_set = []
    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    fd = open(path, 'rb')
    page_no = 0
    # First pass (pdfminer): full text per page.
    for page in PDFPage.get_pages(fd, pagenums):
        output = StringIO()
        converter = TextConverter(manager, output, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)
        interpreter.process_page(page)
        doc = output.getvalue()
        # Close per-page resources before any `continue` (the original
        # leaked them on 'Service of Process' pages).
        converter.close()
        output.close()
        all_pages.append(doc)
        # Skip transmittal cover pages.
        if doc[:18] == "Service of Process":
            continue
        page_no = page_no + 1
        # Only the first 5 substantive pages feed the jurisdiction text.
        if page_no <= 5:
            all_pages_juris.append(doc)
    all_page_set.append(' '.join(all_pages_juris))
    fd.close()
    # Second pass (pdfquery): text with bounding boxes, up to 31 pages.
    i = 0
    pdf = pdfquery.PDFQuery(path)
    try:
        while i <= 30:
            pdf.load(i)
            JQuery = pdf.pq('LTPage')
            if JQuery.text().find('Service of Process Transmittal') >= 0:
                i += 1
                continue
            j = 0
            LineLength = len(JQuery("LTTextLineHorizontal"))
            BoxLength = len(JQuery("LTTextBoxHorizontal"))
            # A page with no text lines terminates extraction cleanly.
            if LineLength == 0:
                return 1, PagePosDict, PagePosDict1, PagePosDictCord, all_pages, all_page_set, PagePosDictJuris, PagePosDictPages
            if LineLength < BoxLength:
                NetLength = BoxLength
            else:
                NetLength = LineLength
            PagePosDictPage = defaultdict()
            while (j < NetLength):
                if j < LineLength and i <= 30:
                    PagePosDict.append(
                        JQuery(JQuery("LTTextLineHorizontal")[j]).text())
                    # Key: (page index, x0, y0, x1, y1) from the bbox attr.
                    cordinates = list()
                    cordinates.append(i)
                    cord = JQuery(
                        JQuery("LTTextLineHorizontal")[j]).attr('bbox')
                    for a in ['[', ']']:
                        cord = cord.replace(a, '')
                    for a in cord.split(', '):
                        cordinates.append(float(a))
                    PagePosDictCord[tuple(cordinates)] = JQuery(
                        JQuery("LTTextLineHorizontal")[j]).text()
                if j < BoxLength and i <= 30:
                    PagePosDict1.append(
                        JQuery(JQuery("LTTextBoxHorizontal")[j]).text())
                    cordinates = list()
                    cordinates.append(i)
                    cord = JQuery(
                        JQuery("LTTextBoxHorizontal")[j]).attr('bbox')
                    for a in ['[', ']']:
                        cord = cord.replace(a, '')
                    for a in cord.split(', '):
                        cordinates.append(float(a))
                    PagePosDictPage[tuple(cordinates)] = JQuery(
                        JQuery("LTTextBoxHorizontal")[j]).text()
                # Jurisdiction candidates come from the first 8 pages only.
                if j < BoxLength and i <= 7:
                    PagePosDictJuris.append(
                        JQuery(JQuery("LTTextBoxHorizontal")[j]).text())
                j += 1
            PagePosDictPages[i] = PagePosDictPage
            i += 1
    # BUG FIX: `except Exception, e:` is Python 2 syntax and a SyntaxError
    # on Python 3; `e` was unused, so it is dropped entirely.
    except Exception:
        return 0, PagePosDict, PagePosDict1, PagePosDictCord, all_pages, all_page_set, PagePosDictJuris, PagePosDictPages
def _bt_keyword_map(self):
    """Return the Bloom's-taxonomy level -> keyword-list mapping.

    (The original duplicated these tables in two branches of post().)
    """
    bt1 = [
        'define', 'describe', 'draw', 'find', 'identify', 'label', 'list',
        'locate', 'match', 'memorise', 'name', 'recall', 'recite',
        'recognize', 'relate', 'reproduce', 'select', 'state', 'tell',
        'write'
    ]
    bt2 = [
        'compare', 'convert', 'demonstarte', 'describe', 'discuss',
        'distinguish', 'explain', 'find out more information about',
        'generalize', 'interpret', 'outline', 'paraphrase', 'predict',
        'put into your own words', 'relate', 'restate', 'summarize',
        'translate', 'visualize'
    ]
    bt3 = [
        'apply', 'calculate', 'change', 'choose', 'complete', 'construct',
        'examine', 'illustrate', 'interpret', 'make', 'manipulate',
        'modify', 'produce', 'put into practice', 'put together', 'solve',
        'show', 'translate', 'use'
    ]
    bt4 = [
        'advertise', 'analyse', 'categoriase', 'compare', 'contrast',
        'deduce', 'differenciate', 'distinguish', 'examine', 'explain',
        'identify', 'investigate', 'seperate', 'subdivide', 'take apart'
    ]
    bt5 = [
        'argue', 'assess', 'choose', 'compose', 'construct', 'create',
        'criticise', 'critique', 'debate', 'decide', 'defend', 'design',
        'determine', 'device', 'discuss', 'estimate', 'evaluate',
        'formulate', 'imagine', 'invent', 'judge', 'justify', 'plan',
        'predict', 'prioritise', 'propose', 'rate', 'recommend', 'select',
        'value'
    ]
    bt6 = [
        'add to', 'argue', 'assess', 'choose', 'combine', 'compose',
        'construct', 'create', 'debate', 'decide', 'design', 'determine',
        'devise', 'discuss', 'forcast', 'formulate', 'hypothesise',
        'imagine', 'invent', 'judge', 'justify', 'originate', 'plan',
        'predict', 'priortise', 'propose', 'rate', 'recommend', 'select',
        'verify'
    ]
    return {
        'bt1': bt1,
        'bt2': bt2,
        'bt3': bt3,
        'bt4': bt4,
        'bt5': bt5,
        'bt6': bt6
    }

def _classify_questions(self, data):
    """Split *data* on 'q' and tag each question with the taxonomy levels
    of the keywords it contains.

    :param data: lower-cased question text.
    :return: (questions, per-question list of matched level names)
    """
    bt = self._bt_keyword_map()
    read1 = data.split("q")
    btperq = []
    for i in range(1, len(read1)):
        btlevellist = []
        # BUG FIX: str.translate(None, digits) is Python 2 only; build a
        # deletion table with str.maketrans for Python 3.
        read1[i] = read1[i].translate(str.maketrans('', '', digits))
        read1[i] = re.sub('[.,!?]', '', read1[i])
        words = read1[i].split(" ")
        for word in words:
            # BUG FIX: the original indexed bt.keys()[...] which fails on
            # Python 3 dict views; iterate items() instead.
            for level, keywords in bt.items():
                if word in keywords:
                    btlevellist.append(level)
        btperq.append(btlevellist)
    return read1, btperq

def _extract_pdf_text(self, pdf_path):
    """Extract all text from *pdf_path* using pdfminer's page aggregator."""
    password = ""
    extracted_text = ""
    # Open and read the pdf file in binary mode; closed on all paths.
    with open(pdf_path, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser, password)
        # Abort if the document forbids text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Only LTTextBox and LTTextLine objects carry text.
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    extracted_text += lt_obj.get_text()
    return extracted_text

def _ocr_image(self, image_path):
    """Grayscale + Otsu-threshold the image, then OCR it with pytesseract."""
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Thresholding preprocesses the image for more reliable OCR.
    gray = cv2.threshold(gray, 0, 255,
                         cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    # Write the grayscale image to disk as a temporary file so we can
    # apply OCR to it, then remove it.
    filename = "{}.png".format(os.getpid())
    cv2.imwrite(filename, gray)
    text = pytesseract.image_to_string(Image.open(filename))
    os.remove(filename)
    return text

def post(self, request, *args, **kwargs):
    """Handle an uploaded file.

    PDFs are converted to text and returned verbatim; JPGs are OCR'd;
    any other extension is treated as a question file and classified by
    Bloom's-taxonomy keywords.
    """
    file_serializer = FileSerializer(data=request.data)
    if not file_serializer.is_valid():
        return Response(file_serializer.errors,
                        status=status.HTTP_400_BAD_REQUEST)
    file_serializer.save()
    file_path = "D:/file/fileupload" + file_serializer.data["file"]
    t = file_path.split(".")
    if t[1] == "pdf":
        extracted_text = self._extract_pdf_text(file_path)
        # BUG FIX: the original returned the undefined name
        # `nextracted_text`, raising NameError at runtime.
        return HttpResponse(extracted_text.encode("utf-8"))
    elif t[1] == "jpg":
        return HttpResponse(self._ocr_image(file_path))
    else:
        # Treat any other upload as a plain-text question file.
        with open(file_path, "r") as fh:
            data = fh.read().lower()
        read1, btperq = self._classify_questions(data)
        senddata = {
            'question': read1,
            'btlevel': btperq,
            'list': zip(read1, btperq)
        }
        return Response(senddata, template_name='file.html')
def create_candidates(path, sel_id, min_req, desire_req):
    """Build candidate records from resumes stored in the AWS S3 bucket ``rosev0``.

    Every object under the key prefix ``path`` is read; its text is extracted
    according to the file extension (.docx via python-docx, .pdf via pdfminer,
    .doc via the external ``antiword`` tool), then three spaCy NER models
    (work / education / personal) are run over the text and their entities are
    folded into one candidate dict per resume.

    :param path: S3 key prefix to scan for resumes.
    :param sel_id: Selection id stamped on every produced candidate.
    :param min_req: Minimum requirements (currently unused in this function).
    :param desire_req: Desired requirements (currently unused in this function).
    :return: list of dicts with keys ``name``, ``mail``, ``info``, ``selection``.

    NOTE(review): relies on module-level globals ``current``, ``min_locs`` and
    ``googlemaps_reserved`` — confirm they are defined in this module.
    """
    # Connect to AWS S3.
    s3 = boto3.resource("s3",
                        region_name='us-east-2',
                        aws_access_key_id=os.environ.get('AWS_KEY'),
                        aws_secret_access_key=os.environ.get('AWS_SECRET'))
    # Load the NER models once, outside the per-resume loop.
    work_nlp = spacy.load('selection/models/work')
    ed_nlp = spacy.load('selection/models/education')
    per_nlp = spacy.load('selection/models/personal')
    rose_bucket = s3.Bucket(r'rosev0')
    candidates = []
    for resume in rose_bucket.objects.filter(Prefix=path):
        key = resume.key
        body = resume.get()['Body'].read()
        buffer = io.BytesIO()
        buffer.write(body)
        ext = re.search(r'\.[a-z]+$', key)
        print(key)
        # FIX: reset per resume; previously an unknown extension left `text`
        # unbound (NameError) or stale from the previous resume.
        text = None
        if ext is None:
            continue
        elif ext.group() == '.docx':
            document = Document(buffer)
            text = "\n".join(paragraph.text for paragraph in document.paragraphs)
        elif ext.group() == '.pdf':
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            laparams = LAParams()
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(buffer,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
            text = retstr.getvalue()
        elif ext.group() == '.doc':
            # antiword (Linux) tolerates damaged .doc files; '$' and '#'
            # break shell paths, so neutralize them in the temp filename.
            filename = str(key).split('/')[-1].replace('$', '_').replace('#', '_')
            pathdoc = 'selection/tmp/' + filename
            print('trying download in ' + pathdoc)
            rose_bucket.download_file(key, pathdoc)
            try:
                output = subprocess.check_output('antiword "' + pathdoc + '"',
                                                 shell=True)
                text = output.decode('utf-8')
            except Exception:
                # Best-effort: skip resumes antiword cannot process.
                continue
        if text is not None:
            doc_work = work_nlp(text)
            doc_ed = ed_nlp(text)
            doc_per = per_nlp(text)
            results = []
            # Work-model entity buckets.
            comp_work, desig_work, years_work, other_work = [], [], [], []
            desig_ind, years_ind = [], []
            idioms, skills = [], []
            # Education-model entity buckets.
            grad_ed, colleges, degrees, certif = [], [], [], []
            # Personal-model entity buckets.
            names, locations, mails, phones = [], [], [], []
            for ent in doc_work.ents:
                value = ent.text
                if ent.label_ == 'companies worked at':
                    comp_work.append(value)
                elif ent.label_ == 'designation':
                    desig_work.append(value)
                    desig_ind.append(ent.start_char)
                elif ent.label_ == 'years of experience':
                    years_work.append(value)
                    years_ind.append(ent.start_char)
                elif ent.label_ == 'idioms':
                    idioms.append(value)
                elif ent.label_ == 'skills':
                    skills.append(value)
                else:
                    other_work.append([ent.label_, value])
                results.append([ent.label_, value, text.index(value)])
            for ent in doc_ed.ents:
                value = ent.text
                if ent.label_ == 'graduation year':
                    grad_ed.append(value)
                elif ent.label_ == 'college':
                    colleges.append(value)
                elif ent.label_ == 'degree':
                    degrees.append(value)
                elif ent.label_ == 'certifications':
                    certif.append(value)
                results.append([ent.label_, value, text.index(value)])
            for ent in doc_per.ents:
                value = ent.text
                if ent.label_ == 'name':
                    names.append(value)
                elif ent.label_ == 'location':
                    locations.append(value)
                elif ent.label_ == 'mail':
                    mails.append(value)
                elif ent.label_ == 'phone':
                    phones.append(value)
                results.append([ent.label_, value, text.index(value)])
            print(
                'Validating same lenght of ind and entities, desig_work_years: '
                + str(len(desig_work)) + ' ' + str(len(comp_work)) + ' '
                + str(len(years_work)))
            ### YEARS OF EXPERIENCE — years since the earliest job year found.
            working_years = re.findall("[0-9]{4}", str(years_work))
            if_current = [
                value.replace("'", "")
                for value in re.findall("([a-zA-Z]+')", str(years_work))
                if value != "experience'"
            ]
            val_current = 0
            for value in if_current:
                for value2 in current:  # `current`: module-level word list — confirm
                    if SequenceMatcher(None, value.lower(), value2).ratio() > 0.8:
                        val_current += 1
            # FIX: default so a resume with no detectable year neither raises
            # NameError nor leaks the previous resume's value.
            experience = 0
            if len(working_years) != 0:
                year_min = min(working_years)
                if val_current > 0:
                    # A "current job" word was found: count up to this year.
                    year_max = datetime.now().strftime('%Y')
                else:
                    year_max = max(working_years)
                experience = int(year_max) - int(year_min)
            ### IDIOMS — drop the literal word 'idioma' itself from the list.
            c_idioms = [
                value for value in idioms
                if SequenceMatcher(None, value.lower(), 'idioma').ratio() < 0.8
            ]
            print(c_idioms)
            ### DEGREE TYPE — technical / master / doctorate / professional.
            degrees_l = str(degrees).lower()
            degrees_ascii = degrees_l.replace('í', 'i')
            pro_type = []
            if 'técnic' in degrees_l or 'tecnic' in degrees_l:
                pro_type.append('Técnico')
            elif ('msc' in degrees_l or 'master' in degrees_l
                  # FIX: was `'maestría' in ….replace('í','i')` which can never
                  # match because the accented char was removed from the haystack.
                  or 'magister' in degrees_ascii or 'maestria' in degrees_ascii):
                pro_type.append('Master')
            elif ('phd' in degrees_l or 'doctor' in degrees_l
                  or 'doctorado' in degrees_l):
                pro_type.append('Doctorado')
            elif 'universidad' in degrees_l:
                pro_type.append('Profesional')
            else:
                pro_type.append('-')
            ### LOCATION — is the candidate inside the minimum region?
            # FIX: was `len[min_locs]`, which raises TypeError (subscripting
            # the `len` builtin) on the very first candidate.
            if len(min_locs) > 0:
                if len(locations) > 0:
                    location = []
                    if len(locations) >= 2 and locations[1] not in locations[0]:
                        adress_raw = locations[0] + ' ' + locations[1]
                    else:
                        adress_raw = locations[0]
                    # FIX: accumulate replacements; the original restarted from
                    # adress_raw each pass, so only the last reserved char was
                    # actually stripped before the Google Maps call.
                    adress = str(adress_raw)
                    for car in googlemaps_reserved:
                        adress = adress.replace(car, '')
                    endpoint = ('https://maps.googleapis.com/maps/api/geocode/json?address='
                                + adress.replace(' ', '+')
                                + '&key=' + os.environ.get('MAPS_KEY'))
                    try:
                        get_location = requests.get(endpoint).json()['results'][0]
                        for comp in get_location['address_components']:
                            if comp['types'][0] == 'administrative_area_level_1':
                                location.append(comp['long_name'])
                            if comp['types'][0] == 'country':
                                location.append(comp['long_name'])
                    except Exception:
                        loc_ind = 2  # geocoding failed: unknown
                        continue  # skip this resume entirely
                    for loc in min_locs:
                        min_loc = []
                        endpoint2 = ('https://maps.googleapis.com/maps/api/geocode/json?address='
                                     + loc.replace(' ', '+')
                                     + '&key=' + os.environ.get('MAPS_KEY'))
                        get_minloc = requests.get(endpoint2).json()['results'][0]
                        for comp in get_minloc['address_components']:
                            if comp['types'][0] == 'administrative_area_level_1':
                                min_loc.append(comp['long_name'])
                            if comp['types'][0] == 'country':
                                min_loc.append(comp['long_name'])
                        if len(min_loc) == 1:
                            # Only a country to compare against.
                            loc_ind = 1 if location[1] == min_loc[0] else 0
                        elif len(min_loc) == 2:
                            # Region and country must both match.
                            loc_ind = 1 if (location[1] == min_loc[1]
                                            and location[0] == min_loc[0]) else 0
                else:
                    loc_ind = 2  # no candidate location found: unknown
            else:
                loc_ind = 0  # no location requirement configured
            # NOTE(review): loc_ind is computed but not included in cand_data —
            # presumably consumed by the (future) ranking step; confirm.
            cand_data = {
                "exp": experience,
                "type": pro_type,
                "idioms": c_idioms,
                "skills": skills,
                "location": locations,
                "companies": comp_work,
                "phone": phones,
                "mail": mails,
                "designation": desig_work,
                "college": colleges,
                "graduation": grad_ed,
                # Key typo kept on purpose: downstream consumers may already
                # depend on "certficiations".
                "certficiations": certif
            }
            ### ADD RANKING HERE — placeholder random rank for now.
            cand_info = {"data": cand_data, "rank": random.randint(1, 100)}
            cand = {
                "name": names[0] if len(names) > 0 else 'Desconocido',
                "mail": mails[0] if len(mails) > 0 else None,
                "info": cand_info,
                "selection": sel_id,
            }
            candidates.append(cand)
    return candidates
def extract_text_from_pdf(self, way='pdfminer', outputType="text"):
    """Extract text from the PDF at ``self.path``.

    Parameters
    ----------
    way : str
        Extraction backend: ``'pdfminer'`` (default) or ``'fitz'`` (PyMuPDF).
    outputType : str
        Used only by the ``'fitz'`` backend; passed straight to
        ``page.getText``. One of "text" (default, plain text), "html",
        "dict", "rawdict", "xhtml" or "xml" — see the PyMuPDF TextPage docs
        for the exact structure of each.

    Returns
    -------
    str
        Raw text of the whole document when ``way == 'pdfminer'`` and any
        text was found.
    list
        Per-page extraction results when ``way == 'fitz'`` (list index is
        the page number).
    None
        When no backend produced output (an info message is logged).
    """
    logging.info('Inside extract_text_from_pdf')
    if way == 'pdfminer':
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        try:
            with open(self.path, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()
        finally:
            # FIX: release the converter and buffer even when page
            # processing raises (they previously leaked on error).
            converter.close()
            fake_file_handle.close()
        if text:
            logging.info('Successfully extracted text - Exiting extract_text_from_pdf')
            return text
    if way == 'fitz':
        pdf_document = fitz.open(self.path)
        text_data_list = []
        for page_number in range(pdf_document.pageCount):
            text_data_list.append(pdf_document[page_number].getText(outputType))
        # FIX: close the document so PyMuPDF releases the file handle.
        pdf_document.close()
        return text_data_list
    logging.info('PDF was not readable - Exiting extract_text_from_pdf')
doc = PDFDocument() # Connect the parser and document objects. parser.set_document(doc) doc.set_parser(parser) # Supply the password for initialization. # (If no password is set, give an empty string.) password = '' doc.initialize(password) # Check if the document allows text extraction. If not, abort. if not doc.is_extractable: raise PDFTextExtractionNotAllowed # Create a PDF resource manager object that stores shared resources. rsrcmgr = PDFResourceManager() # Create a PDF device object. laparams = LAParams() device = PDFPageAggregator(rsrcmgr, laparams=laparams) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document. for page in doc.get_pages(): interpreter.process_page(page) layout = device.get_result() for element in layout: if isinstance(element, LTTextBoxHorizontal): print(element.get_text())
def main(argv):
    """pdf2txt-style command-line driver.

    Parses pdf2txt options from ``argv``, builds the requested converter
    device (text/xml/html/tag) and runs every input file through it.

    :param argv: full argument vector, ``argv[0]`` being the program name.
    :return: 100 on usage error (bad option or no input files), None on success.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            # NOTE: disables layout analysis; a later -A/-V/-M/... after -n
            # would fail on the None laparams (pre-existing behavior, kept).
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    # Propagate the debug level to the pdfminer components.
    # PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # Infer the output type from the output filename when not given with -t.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        # FIX: `file(...)` is a Python 2 builtin (NameError on Python 3).
        # Binary mode because the converters encode output with `codec`.
        outfp = open(outfile, 'wb')
    else:
        # On Python 3 the converters need the byte-oriented stdout buffer.
        outfp = sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams, imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec,
                              laparams=laparams, imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in args:
        # FIX: `file(fname, 'rb')` -> `open(fname, 'rb')` (Python 3).
        fp = open(fname, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        fp.close()
    device.close()
    outfp.close()
    return
def extract_pdf_data(fp, test_proportion=0, labels=None, session=None):
    """Get PDF data from a file.

    Walks every page of the PDF, recording each text box as a Box row and
    each non-empty text line as a Line row, all attached to one Document row.

    TODO why is this a standalone function?

    :param fp: A file pointer to the PDF.
    :param test_proportion: Probability that the document is flagged as a
        test sample (``is_test``).
    :param labels: Correct metadata labels for this document
        (attribute name -> value, set on the Document). Defaults to none.
    :param session: A SQLAlchemy session, for saving.
    :return: A Document object.
    """
    # FIX: mutable default argument (labels={}) replaced with the
    # None-sentinel idiom; behavior for all callers is unchanged.
    if labels is None:
        labels = {}
    filename = os.path.split(fp.name)[-1]
    parser = PDFParser(fp)
    pdf = PDFDocument(parser)
    parser.set_document(pdf)
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.create_pages(pdf)
    document = Document(filename=filename, is_test=random() < test_proportion)
    for key in labels:
        setattr(document, key, labels[key])
    if session:
        session.add(document)
    for i, page in enumerate(pages):
        # TODO: figure out how to get the number of pages directly from
        # the pages object instead of counting as we go.
        document.num_pages = i + 1
        interpreter.process_page(page)
        layout = device.get_result()
        boxes = [obj for obj in layout if isinstance(obj, LTTextBox)]
        for b in boxes:
            box = Box(document=document,
                      page=i,
                      x0=b.bbox[0],
                      y0=b.bbox[1],
                      x1=b.bbox[2],
                      y1=b.bbox[3],
                      vertical=isinstance(b, LTTextBoxVertical))
            if session:
                session.add(box)
            lines = [obj for obj in b if isinstance(obj, LTTextLine)]
            for l in lines:
                # Strip pdfminer's unmapped-glyph placeholders, e.g. "(cid:5)".
                text = re.sub(r'\(cid:\d+\)', "", l.get_text()).strip()
                if len(text) > 0:
                    line = Line(box=box,
                                document=document,
                                x0=l.bbox[0],
                                y0=l.bbox[1],
                                x1=l.bbox[2],
                                y1=l.bbox[3],
                                text=text,
                                vertical=isinstance(l, LTTextLineVertical),
                                page=i)
                    if session:
                        session.add(line)
    # Do the whole file in one transaction so we can restart easily
    # if necessary.
    if session:
        session.commit()
    return document
def pdf_to_str(pdf_filepath):
    """Return the text content of the PDF at ``pdf_filepath`` as a string.

    Adapted from https://gist.github.com/vinovator/c78c2cb63d62fdd9fb67
    (pdfTextMiner.py). Pipeline: PDFParser reads the file, PDFDocument holds
    the parsed content, and a PDFPageInterpreter renders each page through a
    PDFPageAggregator so LTTextBox/LTTextLine objects can be collected.

    :param pdf_filepath: path to the PDF file.
    :return: concatenated text of every text box/line in the document.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    password = ""
    extracted_text = ""
    # FIX: `with` guarantees the file is closed even when
    # PDFTextExtractionNotAllowed (or any parsing error) is raised;
    # the original leaked the handle on those paths.
    with open(pdf_filepath, "rb") as fp:
        # Parse the raw file and hand the content to a document object.
        parser = PDFParser(fp)
        document = PDFDocument(parser, password)
        # Abort if the document does not permit text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Shared resources (fonts, images) + layout-analysis parameters.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # The aggregator device exposes layout (LT*) objects per page.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Only text containers contribute to the output string.
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    extracted_text += lt_obj.get_text()
    return extracted_text