Example #1
def runall(path):
    docfile_list = getalldocfilename(path)  # collect the names and paths of every file in the folder
    pdffile_list = getallpdffilename(path)
    htmlfile_list = getallhtmlfilename(path)
    emlfile_list = getallemlfilename(path)

    # Convert all Word documents to txt, using the wrapped doc2txt module
    for docfile in docfile_list:
        # normalize an upper-case .DOC extension to lower case
        if docfile[-4:] == '.DOC':
            doc_file = docfile[:-4] + '.doc'
        else:
            doc_file = docfile
        d2t.convert_doc_to_txt(doc_file)

    # Convert all PDF documents to txt, using the wrapped pdf2txt module
    for pdffile in pdffile_list:
        # print pdffile
        pdf2t = pdffile[:-4]  # strip the .pdf extension
        f = open(pdf2t + '.txt', 'w+')  # save as txt
        f.write(p2t.convert_pdf_to_txt(pdffile))
        f.close()

    # Convert all HTML documents to txt
    for htmlfile in htmlfile_list:
        html2t = htmlfile[:-5]
        fout = open(html2t + '.txt', 'w')
        fin = open(htmlfile, 'r')
        strfile = fin.read()
        # print chardet.detect(strfile)
        # normalize the text encoding to utf-8
        encoding = chardet.detect(strfile)['encoding']
        if encoding == 'GB2312':
            str_file = h2t.html2text(
                strfile.decode("gbk", 'ignore').encode("utf-8", 'ignore'))
        elif encoding in ('utf-8', 'UTF-8-SIG'):
            str_file = h2t.html2text(strfile)
        else:
            str_file = h2t.html2text(strfile)
        for t in str_file:
            txt = re.sub(r'[# * | -]?', '', t)  # strip markdown characters (#, *, |, -) left by html2text; the class also removes spaces
            fout.write(txt)
        fout.close()

    # Convert all email documents to txt
    for emlfile in emlfile_list:
        fp = open(emlfile, "r")
        msg = email.message_from_file(fp)  # build a message object from the file
        email2t = emlfile[:-4]
        fout = open(email2t + '.txt', 'w')
        emltext = 'content:{}'.format(e2t.convert_eml_to_txt(msg))
        # print chardet.detect(emltext)
        encoding = chardet.detect(emltext)['encoding']
        if encoding == 'GB2312':
            str_file = h2t.html2text(
                emltext.decode("gbk", 'ignore').encode("utf-8", 'ignore'))
        elif encoding in ('utf-8', 'UTF-8-SIG'):
            str_file = h2t.html2text(emltext)
        else:
            str_file = h2t.html2text(emltext)
        print(str_file)
        for t in str_file:
            txt = re.sub(r'[# * | ]?', '', t)  # strip markdown characters (#, *, |) left by html2text
            fout.write(txt)
        fout.close()
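Example #1 is Python 2 code and repeats the same chardet-based normalization for its HTML and email branches. As a minimal sketch, the equivalent step in Python 3 might look like the following; only the chardet package is assumed, and the helper name read_as_utf8 is made up for illustration.

import chardet

def read_as_utf8(path):
    # Read raw bytes, guess the encoding, and return a decoded string.
    with open(path, 'rb') as fin:
        raw = fin.read()
    encoding = (chardet.detect(raw)['encoding'] or 'utf-8').upper()
    if encoding in ('GB2312', 'GBK'):
        return raw.decode('gbk', 'ignore')
    return raw.decode('utf-8', 'ignore')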
Example #2
def handle_pdffiles(files):
    for pdffile in files:
        # print pdffile
        pdf2t = pdffile[:-4]  # strip the .pdf extension
        if (not os.path.exists(pdf2t + ".txt")):  # skip files that already have a txt version
            # print os.path.exists(pdf2t + ".txt")
            f = open(pdf2t + '.txt', 'w+')  # save as txt
            f.write(p2t.convert_pdf_to_txt(pdffile))
            f.close()
Example #3
def admin_case_entry():
    allowed_extensions = set(['pdf'])

    def allowed_file(filename):
        return '.' in filename and \
               filename.rsplit('.', 1)[1] in allowed_extensions

    case_form = CaseEntryForm()
    if case_form.validate_on_submit():
        file = flask.request.files["file"]
        if not file:
            flask.flash('No file added.')
            return flask.redirect(flask.url_for('admin_config_update'))
        if not allowed_file(file.filename):
            flask.flash('File must be a pdf.')
            return flask.redirect(flask.url_for('admin_config_update'))

        case_data = file.read()
        file_name = secure_filename(file.filename)
        blob_io = files.blobstore.create(mime_type=file.content_type,
                                         _blobinfo_uploaded_filename=file_name)

        with files.open(blob_io, 'a') as f:
            f.write(case_data)

        files.finalize(blob_io)
        blob_key = files.blobstore.get_blob_key(blob_io)

        case = model.Case(reward=case_form.reward.data,
                          date_of_case=case_form.date.data,
                          case_file=blob_key)
        case.put()
        document = search.Document(fields=[
            search.TextField(name='case', value=convert_pdf_to_txt(file)),
            search.TextField(name='case_key', value=case.key.urlsafe())
        ])
        try:
            index = search.Index(name="Main")
            index.put(document)
            index = search.Index(name="Backup")
            index.put(document)
        except search.Error:
            logging.exception('Put failed')

        flask.flash('Added case successfully.')
        return flask.redirect(flask.url_for('admin_config_update'))

    if case_form.errors:
        util.flash_errors(case_form)
    return flask.redirect(flask.url_for('admin_config_update'))
Example #4
def compute_document_grades(pdfs_dir_name, country, uni, document):

	try:
		document_text = pdf2txt.convert_pdf_to_txt(pdfs_dir_name + "/" + country + "/" + uni + "/" + document)
	except (pdfminer.pdfparser.PDFSyntaxError, pdfminer.pdfdocument.PDFTextExtractionNotAllowed, TypeError):
		return (0,0)

	p = re.compile("([1-5]) *[:-]")
	document_text = document_text[::-1]

	grades = [m.group(1) for m, _ in zip(p.finditer(document_text), range(2))]
	if(len(grades) < 2):
		return (0,0)

	return (int(grades[1]), int(grades[0]))
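compute_document_grades relies on a small trick: grades are written after a colon or dash (e.g. "Originality: 4"), so scanning the reversed text with re.finditer yields the last two grades of the document first. A short, self-contained illustration with a made-up input string:

import re

p = re.compile("([1-5]) *[:-]")
text = "Originality: 4 ... Overall evaluation: 3"
reversed_text = text[::-1]
# finditer over the reversed string sees the last grade first
grades = [m.group(1) for m, _ in zip(p.finditer(reversed_text), range(2))]
print(grades)                              # ['3', '4']
print((int(grades[1]), int(grades[0])))    # (4, 3), i.e. document order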
Example #5
def main(sum_type, startpath, fileList, destination, length):
    
    # Safe opening whether or not path exists
    # Taken from http://stackoverflow.com/a/600612/119527
    def mkdir_p(path):
        """ Create directory if needed"""
        try:
            os.makedirs(path)
        except OSError as exc: # Python >2.5
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else: raise
    
    def safe_open_a(path):
        ''' Open "path" for writing, creating any parent directories as needed.
        '''
        mkdir_p(os.path.dirname(path))
        return open(path, 'a')
    
    def recursive_glob(rootdir='.', suffix=()):
        """ recursively traverses full path from route, returns
            paths and file names for files with suffix in tuple """
        pathlist = []
        filelist = []
        for looproot,dirnames, filenames in os.walk(rootdir):
            for filename in filenames:
                if filename.endswith(suffix):
                    pathlist.append(os.path.join(looproot, filename))
                    filelist.append(filename)
        return pathlist, filelist
        
    def valid_xml_char_ordinal(c):
        codepoint = ord(c)
        # conditions ordered by presumed frequency
        return (
            0x20 <= codepoint <= 0xD7FF or
            codepoint in (0x9, 0xA, 0xD) or
            0xE000 <= codepoint <= 0xFFFD or
            0x10000 <= codepoint <= 0x10FFFF
            )

    def use_sumy(input, SENTENCES_COUNT, method, parser_option):
        """Run sumy over the input and return SENTENCES_COUNT summary sentences.
        # Supported summarization methods:
        #    Luhn - heuristic method
        #    Edmundson - heuristic method with previous statistical research
        #    Latent Semantic Analysis (LSA) - one of the algorithms from
        #        http://scholar.google.com/citations?user=0fTuW_YAAAAJ&hl=en
        #        (the author may be using more advanced algorithms now).
        #        Steinberger, J. and Ježek, K. Using Latent Semantic Analysis in
        #        Text Summarization and Summary Evaluation. In Proceedings of ISIM '04, 2004, pp. 93-100.
        #    LexRank - unsupervised approach inspired by the PageRank and HITS algorithms
        #    TextRank - a graph-based method assembled from several online sources"""
        LANGUAGE = "english"
        #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        if parser_option == 'file':
            parser = PlaintextParser.from_file(input, Tokenizer(LANGUAGE))
        elif parser_option == 'string':
            parser = PlaintextParser.from_string(input, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary = []
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary.append(sentence)
        return summary
          
    def summarize_save(text, length, destination):   
        """ summarized text string and saves in destination file"""
        possible_title = text.split('\n')[0]
        possible_title = ''.join(c for c in possible_title if valid_xml_char_ordinal(c))
    
        summary = use_sumy(text, length, 'textrank', 'string')
#        with safe_open_a(destination) as f:
#            f.writelines('POSSIBLE TITLE: '+possible_title+'\n\n')
#            for line in summary:
#                f.writelines(str(line)+'\n\n')
#            f.writelines('\nFile Path: ' + path+'\n\n\n')
#            f.close()
        document.add_heading('Possible Title: '+possible_title, level=1)   
        for line in summary:
            line = str(line)
            line = unicode(line, errors='ignore')
            clean = ''.join(c for c in line if valid_xml_char_ordinal(c))
            document.add_paragraph(clean)
        path_string = path.replace("\\","/")
        document.add_paragraph('File Path: ' + path_string)
        if destination.endswith('.docx'):
            document.save(destination)
        else:
            document.save(destination+'.docx')

    fileTypes = ('.pdf', '.txt', '.docx', '.htm', '.html', '.pptx')
    document = Document()
    if sum_type == 'directory':
        paths, files = recursive_glob(startpath, fileTypes)
        for idx, path in enumerate(paths):
            if files[idx].endswith('.pdf'):
                full_text = pdf2txt.convert_pdf_to_txt(path)
            elif files[idx].endswith('docx'):
                full_text = docx2txt.get_docx_text(path)
            else:
                full_text = textract.process(path)
            summarize_save(full_text, length, destination) 
    else:
        for path in fileList:
            if path.endswith('.pdf'):
                full_text = pdf2txt.convert_pdf_to_txt(path)
            else:
                full_text = textract.process(path)
            summarize_save(full_text, length, destination) 
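use_sumy above depends on names (PlaintextParser, Tokenizer, Stemmer, Summarizer, get_stop_words) imported outside the snippet. A minimal self-contained sketch of the same sumy calls, assuming the TextRank summarizer stands in for whatever Summarizer was bound to:

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"
parser = PlaintextParser.from_string("Some long plain text to summarize.",
                                     Tokenizer(LANGUAGE))
summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words(LANGUAGE)
for sentence in summarizer(parser.document, 3):  # 3 sentences, like SENTENCES_COUNT
    print(sentence)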
Example #6
def getallpdffilename(path):
    """Recursively collect every .PDF file under path."""
    pdffilenames = []
    for dirpath, dirnames, filenames in os.walk(path):
        filenames = filter(lambda filename: filename[-4:] == '.PDF', filenames)
        filenames = map(lambda filename: os.path.join(dirpath, filename), filenames)
        pdffilenames.extend(filenames)
    return pdffilenames

names = getallpdffilename('pdf')

data=[('company_name','report')]


for name in names:
    print (name)

for name in names:
    result = p2t.convert_pdf_to_txt(name)
    soup = BeautifulSoup(result)
    a_text_b = soup.get_text()
    print (len(a_text_b))
    # the markers '第四节' / '第五节' are the "Section 4" / "Section 5" headings of the report
    index_s = result.rfind('第四节', 0, len(a_text_b))
    print (index_s)
    index_e = result.rfind('第五节', 0, len(a_text_b))
    report = a_text_b[index_s:index_e]
    company_name = a_text_b[0:16]
    a = (company_name.decode('utf-8'), report.decode('utf-8'))
    data.append(a)

print (sys.getdefaultencoding())

print (type(data))
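The last loop searches result with rfind but slices a_text_b, which only lines up if the two strings share the same offsets. A small helper that keeps the search and the slice on one string might look like the sketch below; the markers come from the snippet, while the helper name extract_section is hypothetical.

# -*- coding: utf-8 -*-

def extract_section(text, start_marker=u'第四节', end_marker=u'第五节'):
    # Slice text between the last occurrences of the two section markers
    # ('Section 4' / 'Section 5'); return '' if either is missing or misordered.
    index_s = text.rfind(start_marker)
    index_e = text.rfind(end_marker)
    if index_s == -1 or index_e == -1 or index_e <= index_s:
        return u''
    return text[index_s:index_e]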