def runall(path):
    """Convert every .doc, .pdf, .html and .eml file under *path* into a
    sibling .txt file.

    Relies on module-level helpers: the getall*filename() collectors and
    the converter modules d2t, p2t, h2t, e2t, plus chardet, re, email.
    """
    # Collect the candidate files of each type under the directory tree.
    docfile_list = getalldocfilename(path)
    pdffile_list = getallpdffilename(path)
    htmlfile_list = getallhtmlfilename(path)
    emlfile_list = getallemlfilename(path)

    # Word documents: normalize an upper-case '.DOC' suffix to '.doc'
    # before handing the path to the doc->txt converter.
    for docfile in docfile_list:
        if docfile[-4:] == '.DOC':
            doc_file = docfile[:-4] + '.doc'
        else:
            doc_file = docfile
        d2t.convert_doc_to_txt(doc_file)

    # PDF documents: extract the text and save it next to the original.
    for pdffile in pdffile_list:
        base = pdffile[:-4]  # strip the '.pdf' suffix
        with open(base + '.txt', 'w+') as f:
            f.write(p2t.convert_pdf_to_txt(pdffile))

    # HTML documents: sniff the encoding, re-encode GB2312 pages to
    # UTF-8, strip the markup, then filter out '#', '*', '|', '-', ' '.
    for htmlfile in htmlfile_list:
        base = htmlfile[:-5]  # strip the '.html' suffix
        with open(htmlfile, 'r') as fin:
            strfile = fin.read()
        encoding = chardet.detect(strfile)['encoding']
        # BUGFIX: str_file was left undefined (NameError on the write
        # loop) whenever the detected encoding matched neither branch.
        str_file = ''
        if encoding == 'GB2312':
            str_file = h2t.html2text(
                strfile.decode("gbk", 'ignore').encode("utf-8", 'ignore'))
        elif encoding in ('utf-8', 'UTF-8-SIG'):
            str_file = h2t.html2text(strfile)
        with open(base + '.txt', 'w') as fout:
            for t in str_file:
                fout.write(re.sub(r'[# * | -]?', '', t))

    # Email messages: same encoding handling as HTML above.
    for emlfile in emlfile_list:
        with open(emlfile, "r") as fp:
            msg = email.message_from_file(fp)  # build the message object
        emltext = 'content:{}'.format(e2t.convert_eml_to_txt(msg))
        # BUGFIX: the original sniffed `strfile` (leftover from the HTML
        # loop, undefined when there were no HTML files) in its second
        # condition instead of `emltext`.
        encoding = chardet.detect(emltext)['encoding']
        str_file = ''
        if encoding == 'GB2312':
            str_file = h2t.html2text(
                emltext.decode("gbk", 'ignore').encode("utf-8", 'ignore'))
        elif encoding in ('utf-8', 'UTF-8-SIG'):
            str_file = h2t.html2text(emltext)
        with open(emlfile[:-4] + '.txt', 'w') as fout:
            for t in str_file:
                fout.write(re.sub(r'[# * | ]?', '', t))
def handle_pdffiles(files):
    """Extract text from each PDF in *files* into a sibling .txt file.

    PDFs that already have an extracted .txt next to them are skipped,
    so repeated runs do not redo finished work.
    """
    for pdf_path in files:
        txt_path = pdf_path[:-4] + ".txt"  # replace the '.pdf' suffix
        if os.path.exists(txt_path):
            # Already converted on a previous run - leave it alone.
            continue
        out = open(txt_path, 'w+')
        out.write(p2t.convert_pdf_to_txt(pdf_path))
        out.close()
def admin_case_entry():
    """Handle the admin case-entry form.

    Validates the uploaded PDF, stores its bytes in the blobstore,
    persists a Case entity, and indexes the PDF's text for full-text
    search; always redirects back to the admin config page.
    """
    allowed_extentions = set(['pdf'])

    def allowed_file(filename):
        # Accept only filenames whose extension is in the allow-list.
        return '.' in filename and \
            filename.rsplit('.', 1)[1] in allowed_extentions

    case_form = CaseEntryForm()
    if case_form.validate_on_submit():
        file = flask.request.files["file"]
        if not file:
            flask.flash('No file added.')
            return flask.redirect(flask.url_for('admin_config_update'))
        if not allowed_file(file.filename):
            flask.flash('File must be a pdf.')
            return flask.redirect(flask.url_for('admin_config_update'))
        case_data = file.read()
        file_name = secure_filename(file.filename)
        # Store the raw PDF bytes in the blobstore.
        blob_io = files.blobstore.create(
            mime_type=file.content_type,
            _blobinfo_uploaded_filename=file_name)
        with files.open(blob_io, 'a') as f:
            f.write(case_data)
        files.finalize(blob_io)
        blob_key = files.blobstore.get_blob_key(blob_io)
        case = model.Case(reward=case_form.reward.data,
                          date_of_case=case_form.date.data,
                          case_file=blob_key)
        case.put()
        # BUGFIX: file.read() above consumed the upload stream, so
        # convert_pdf_to_txt(file) previously saw an empty stream and
        # indexed no text; rewind before extracting.
        file.seek(0)
        document = search.Document(fields=[
            search.TextField(name='case', value=convert_pdf_to_txt(file)),
            search.TextField(name='case_key', value=case.key.urlsafe())
        ])
        try:
            # Write to both indexes; a failure is logged, not fatal.
            index = search.Index(name="Main")
            index.put(document)
            index = search.Index(name="Backup")
            index.put(document)
        except search.Error:
            logging.exception('Put failed')
        flask.flash('Added case sucessfully.')
        return flask.redirect(flask.url_for('admin_config_update'))
    if case_form.errors:
        util.flash_errors(case_form)
    return flask.redirect(flask.url_for('admin_config_update'))
def compute_document_grades(pdfs_dir_name, country, uni, document):
    """Extract two grades from a PDF transcript.

    Scans the extracted text back-to-front for digits 1-5 followed by
    optional spaces and ':' or '-', and returns the second-found and
    first-found digits as an int pair; (0, 0) when the PDF cannot be
    read or fewer than two matches exist.
    """
    full_path = pdfs_dir_name + "/" + country + "/" + uni + "/" + document
    try:
        text = pdf2txt.convert_pdf_to_txt(full_path)
    except (pdfminer.pdfparser.PDFSyntaxError,
            pdfminer.pdfdocument.PDFTextExtractionNotAllowed,
            TypeError):
        # Broken / protected / unparsable PDF: report "no grades".
        return (0, 0)
    pattern = re.compile("([1-5]) *[:-]")
    # Reverse the text so the last occurrences in the document are the
    # first matches found.
    reversed_text = text[::-1]
    found = []
    for match in pattern.finditer(reversed_text):
        found.append(match.group(1))
        if len(found) == 2:
            break
    if len(found) < 2:
        return (0, 0)
    return (int(found[1]), int(found[0]))
def main(sum_type, startpath, fileList, destination, length):
    """Summarize documents and collect the summaries into a .docx file.

    sum_type: 'directory' to walk *startpath* recursively; any other
        value processes the explicit *fileList* paths instead.
    startpath: root directory scanned when sum_type == 'directory'.
    fileList: iterable of file paths used in the non-directory mode.
    destination: output .docx path ('.docx' appended when missing).
    length: number of sentences per document summary.
    """
    # Safe opening whether or not path exists
    # Taken from http://stackoverflow.com/a/600612/119527
    def mkdir_p(path):
        """Create directory *path*; no error when it already exists."""
        try:
            os.makedirs(path)
        except OSError as exc:  # Python >2.5
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def safe_open_a(path):
        """Open *path* for appending, creating parent dirs as needed."""
        mkdir_p(os.path.dirname(path))
        return open(path, 'a')

    def recursive_glob(rootdir='.', suffix=()):
        """Recursively walk *rootdir*; return (full paths, bare names)
        of files whose name ends with an entry of the *suffix* tuple."""
        pathlist = []
        filelist = []
        for looproot, dirnames, filenames in os.walk(rootdir):
            for filename in filenames:
                if filename.endswith(suffix):
                    pathlist.append(os.path.join(looproot, filename))
                    filelist.append(filename)
        return pathlist, filelist

    def valid_xml_char_ordinal(c):
        """True when character *c* is legal inside XML (docx content)."""
        codepoint = ord(c)
        # conditions ordered by presumed frequency
        return (
            0x20 <= codepoint <= 0xD7FF or
            codepoint in (0x9, 0xA, 0xD) or
            0xE000 <= codepoint <= 0xFFFD or
            0x10000 <= codepoint <= 0x10FFFF
        )

    def use_sumy(input, SENTENCES_COUNT, method, parser_option):
        """Summarize *input* with sumy.

        parser_option selects how *input* is parsed: 'file' treats it as
        a path, 'string' as raw text. Returns a list of SENTENCES_COUNT
        sentence objects produced by the module-level Summarizer.

        NOTE(review): the *method* parameter is accepted but never read
        here - the summarization algorithm is fixed by whichever
        Summarizer class was imported at module level; confirm intent.
        NOTE(review): if parser_option is neither 'file' nor 'string',
        `parser` is unbound and the call below raises - verify callers.
        """
        LANGUAGE = "english"
        if parser_option == 'file':
            parser = PlaintextParser.from_file(input, Tokenizer(LANGUAGE))
        elif parser_option == 'string':
            parser = PlaintextParser.from_string(input, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary = []
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary.append(sentence)
        return summary

    def summarize_save(text, length, destination):
        """Summarize *text* and append a heading, the summary paragraphs
        and the source path to the shared docx `document` (closure),
        then save the docx to *destination*."""
        # First line of the text doubles as a best-guess title.
        possible_title = text.split('\n')[0]
        possible_title = ''.join(
            c for c in possible_title if valid_xml_char_ordinal(c))
        summary = use_sumy(text, length, 'textrank', 'string')
        document.add_heading('Possible Title: ' + possible_title, level=1)
        for line in summary:
            line = str(line)
            # Python 2 only: re-decode, dropping undecodable bytes.
            line = unicode(line, errors='ignore')
            # Strip characters docx/XML cannot represent.
            clean = ''.join(c for c in line if valid_xml_char_ordinal(c))
            document.add_paragraph(clean)
        # NOTE(review): `path` is read from the enclosing loop variable,
        # so this helper must only be called from inside those loops.
        path_string = path.replace("\\", "/")
        document.add_paragraph('File Path: ' + path_string)
        if destination.endswith('.docx'):
            document.save(destination)
        else:
            document.save(destination + '.docx')

    fileTypes = ('.pdf', '.txt', '.docx', '.htm', '.html', 'htm', '.pptx')
    # One shared output document accumulates every summary.
    document = Document()
    if sum_type == 'directory':
        paths, files = recursive_glob(startpath, fileTypes)
        for idx, path in enumerate(paths):
            path = paths[idx]
            # Pick the extractor by file extension.
            if files[idx].endswith('.pdf'):
                full_text = pdf2txt.convert_pdf_to_txt(path)
            elif files[idx].endswith('docx'):
                full_text = docx2txt.get_docx_text(path)
            else:
                full_text = textract.process(path)
            summarize_save(full_text, length, destination)
    else:
        for path in fileList:
            if path.endswith('.pdf'):
                full_text = pdf2txt.convert_pdf_to_txt(path)
            else:
                full_text = textract.process(path)
            summarize_save(full_text, length, destination)
# NOTE(review): the `for`/`return` below are the tail of a function
# (presumably getallpdffilename) whose `def` line is outside this view;
# indentation reconstructed accordingly - confirm against the full file.
for dirpath, dirnames, filenames in os.walk(path):
    # Keep only files with an upper-case '.PDF' extension.
    filenames = filter(lambda filename: filename[-4:] == '.PDF', filenames)
    filenames = map(lambda filename: os.path.join(dirpath, filename), filenames)
    pdffilenames.extend(filenames)
return pdffilenames

# Module-level script: for each PDF under 'pdf', extract the text between
# the last '第四节' (section 4) and '第五节' (section 5) markers and
# collect (company_name, report) rows.
names = getallpdffilename('pdf')
data = [('company_name', 'report')]
for name in names:
    print (name)
for name in names:
    result = p2t.convert_pdf_to_txt(name)
    soup = BeautifulSoup(result)
    a_text_b = soup.get_text()
    print (len(a_text_b))
    # NOTE(review): the marker offsets are located in `result` but then
    # used to slice `a_text_b` (the tag-stripped text), whose character
    # positions may not line up - verify the slice boundaries.
    index_s = result.rfind('第四节', 0, len(a_text_b))
    print (index_s)
    index_e = result.rfind('第五节', 0, len(a_text_b))
    report = a_text_b[index_s:index_e]
    # First 16 characters are taken as the company name.
    company_name = a_text_b[0:16]
    a = (company_name.decode('utf-8'), report.decode('utf-8'))
    data.append(a)
print (sys.getdefaultencoding())
print (type(data))