def convert_to_text(fname):
    """Extract text from the PDF at *fname*, dump it to .txt and .csv
    files, and forward the keyword-segmented text to convert_to_table().

    The first three extracted lines are kept aside as the report header
    (*txt*); the remainder is re-joined and split into logical lines by
    inserting newlines before known bank-statement keywords, stopping at
    the word 'tran' (start of the transaction table).
    """
    # The original carried a dead `pages = None` / `if not pages` branch;
    # all pages are always processed.
    pagenums = set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # `with` guarantees the input handle is closed even on a parse error.
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
    converter.close()
    text = output.getvalue()
    output.close()  # bug fix: was `output.close` (attribute access, never called)

    text_list = text.split('\n')
    txt = text_list[:3]            # first three lines: report header
    text = ' '.join(text_list[3:])

    print("###################")
    print(txt)

    # Re-insert line breaks before section keywords; stop at 'tran'.
    word_list = text.split(' ')
    string_input = ""
    flag = 0  # set once 'statement' is seen so a following 'account'
              # also starts a new line
    for word in word_list:
        if word.lower() == 'tran':
            break
        if word.lower() in ('customer', 'scheme', 'currency', 'for'):
            word = '\n' + word
        elif word.lower() == 'statement':
            word = '\n' + word
            flag = 1
        elif word.lower() == 'account' and flag == 1:
            word = '\n' + word
        string_input += word + " "
    print("::::::::::::::::::::::")

    # Bare file name: strip directory and extension.
    file_name = fname.split('/')[-1].split('.')[0]

    # Write the whitespace-collapsed page text to the .txt dump.
    text = re.sub(r"\s\s+", " ", text)  # raw string: avoid invalid escapes
    with open(
            "/home/dell/Documents/Aha-Loans/Production/PdfAnalyser/output/txt/output_"
            + file_name + ".txt", "w") as text_file:
        text_file.write("%s" % text)

    # Write the keyword-segmented text to the .csv dump.
    # NOTE(review): the original also computed
    # re.sub("\\s\\s+", " ", string_input) here but then wrote the raw
    # string_input; the cleaned value was never used, so it is dropped —
    # confirm the raw form is the intended CSV content.
    file_name_main = "output_" + file_name + ".csv"
    with open(
            "/home/dell/Documents/Aha-Loans/Production/PdfAnalyser/output/csv/" +
            file_name_main, "w") as csv_file:
        csv_file.write("%s" % string_input)
    convert_to_table(fname, string_input, txt)
示例#2
0
def extract_text_from_pdf():
    """Pick a PDF from the fixed send folder on drive D:, extract text
    from its first three pages, split the run-together EGRUL (Russian
    unified company register) extract into named fields with regexes,
    and render them into a .docx template.

    Side effects: reads the first matching file from the send folder and
    writes the filled template to the document folder, suffixed with the
    company INN.
    """
    # NOTE(review): re.search(str(), item) searches for the EMPTY
    # pattern, which matches every name — so this always selects the
    # first directory entry; confirm a real pattern was intended.
    file = os.listdir('D:\\Data\\send\\')
    files_name = ''
    for item in file:
        if re.search(str(), item):
            files_name = 'D:\\Data\\send\\' + item
            break
    # Standard pdfminer pipeline writing into an in-memory buffer.
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    count = 0
    with open(files_name, 'rb') as fh:
        # Only the first three pages are needed for the header fields.
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)
            count += 1
            if count == 3:
                break

        text = fake_file_handle.getvalue()
    converter.close()
    fake_file_handle.close()

    # Patterns that re-insert separators ('&' or space) where pdfminer
    # glued adjacent tokens together: lower/upper case changes,
    # digit/letter boundaries, dates and punctuation.
    #def patterns1():
    p1 = re.compile(r'([а-я])([А-Я])')
    p2 = re.compile(r'([А-Я]{7})([а-я])')
    p3 = re.compile(r'([A-я])([0-9])')
    p4 = re.compile(r'([0-9])([A-я])')
    p5 = re.compile(r'(\w)(№)')
    p6 = re.compile(r'([.!?)])([A-я])')
    p7 = re.compile(r'(\d{2}.\d{2}.\d{4})([A-я])')
    p8 = re.compile(r'([A-я])(\d{2}.\d{2}.\d{4})')
    p9 = re.compile(r'([,])([A-я])')
    p10 = re.compile(r'([А-я]")([A-я])')
    # Apply the separator substitutions in a fixed order; each step
    # feeds the next, so the order matters.
    #def no_CaMeLs():
    text2 = re.sub(p1, r"\1&\2", text)
    text3 = re.sub(p2, r"\1 \2", text2)
    text4 = re.sub(p3, r"\1&\2", text3)
    text5 = re.sub(p4, r"\1&\2", text4)
    text6 = re.sub(p5, r"\1 \2", text5)
    text7 = re.sub(p6, r"\1&\2", text6)
    text8 = re.sub(p7, r"\1 \2", text7)
    text9 = re.sub(p8, r"\1 \2", text8)
    text10 = re.sub(p9, r"\1 \2", text9)
    text11 = re.sub(p10, r'\1&\2', text10)
    # Field patterns keyed to the fixed layout of the extract; '&' is
    # the separator injected by the substitutions above.
    #def patterns2():
    p_vipiska_date = re.compile(
        r'(.+)(ИСКА из Единого государственного реестра юридических лиц&)(\d{2}.\d{2}.\d{4})(.+)'
    )
    p_number_date = re.compile(r'(.+)(№.+)(&дата формирования выписки)(.+)')
    p_full_title = re.compile(
        r'(.+)(Настоящая выписка содержит сведения о юридическом лице&)([А-я\s"]+)(.+)'
    )
    p_inn = re.compile(
        r'(.+)(Сведения об учете в налоговом органе&\d\d&ИНН&)([0-9]+)(\d\d&КПП&)(.+)'
    )
    p_ogrn = re.compile(
        r'(.+)(полное наименование юридического лица&ОГРН&)([0-9]+)(.+)')
    p_name = re.compile(r'(.+)(Сокращенное наименование&)([А-я\s"]+)(.+)')
    p_date_egrul = re.compile(
        r'(.+)(3&ГРН и дата внесения в ЕГРЮЛ записи, содержащей указанные сведения&)([0-9\.]+)(\d{2}.\d{2}.\d{4})(&Адрес )(.+)'
    )
    p_index = re.compile(
        r'(.+)(4&Почтовый индекс&)([0-9]{6})(5&Субъект Российской Федерации&)(.+)'
    )
    p_subject_RF = re.compile(
        r'(.+)(5&Субъект Российской Федерации&)([А-я\s]+)(&6&Улица )(.+)')
    p_street = re.compile(
        r'(.+)(проспект, переулок и т\.&д\..&)(.+)(&7&Дом )(.+)')
    p_house = re.compile(r'(.+)(&ДОМ )([0-9]+)(&Корпус )(.+)')
    p_corpus = re.compile(r'(.+)(&СТРОЕНИЕ )([0-9]+)(&Офис )(.+)')
    p_flat = re.compile(r'(.+)(&)(.+)(10&ГРН и дата внесения в ЕГРЮЛ)(.+)')
    p_date_egrul2 = re.compile(
        r'(.+)(10&ГРН и дата внесения в ЕГРЮЛ записи, содержащей указанные сведения&)([0-9\.]+)(\d{2}.\d{2}.\d{4})(&Сведения о регистрации&11&)(.+)'
    )
    p_registration = re.compile(
        r'(.+)(11&Способ образования&)([А-я\s]+)(&12&ОГРН)(.+)')
    p_date_registration = re.compile(
        r'(.+)(13&Дата регистрации&)(\d{2}.\d{2}.\d{4})(14&ГРН и дата внесения в ЕГРЮЛ записи, содержащей указанные сведения&)(.+)'
    )
    p_date_egrul3 = re.compile(
        r'(.+)(14&ГРН и дата внесения в ЕГРЮЛ записи, содержащей указанные сведения&)([0-9\.]+)(\d{2}.\d{2}.\d{4})(&Сведения о регистрирующем органе по месту нахождения юридического лица&15)(.+)'
    )
    p_kpp = re.compile(r'(.+)(&КПП&)([0-9]+)(&Дата постановки на учет&)(.+)')
    p_date_inn = re.compile(
        r'(.+)(&Дата постановки на учет&)(\d{2}.\d{2}.\d{4})(21&Наименование налогового органа&)(.+)'
    )
    p_tax_office = re.compile(
        r'(.+)(&Наименование налогового органа&)(.+)(22&ГРН и дата внесения в ЕГРЮЛ записи)(.+)'
    )
    p_capital = re.compile(r'(.+)(31&Вид&)(.+)(&32&Размер )(.+)')
    p_capital2 = re.compile(
        r'(.+)(&32&Размер .в рублях.)(.+)(33&ГРН и дата внесения в ЕГРЮЛ)(.+)')
    p_surname = re.compile(r'(.+)(35&Фамилия&)(.+)(&36&Имя&)(.+)')
    p_name_gener = re.compile(r'(.+)(&36&Имя&)(.+)(&37&Отчество&)(.+)')
    p_patronymic = re.compile(r'(.+)(&37&Отчество&)(.+)(&38&ИНН&)(.+)')
    p_inn_gener = re.compile(
        r'(.+)(&38&ИНН&)(.+)(39&ГРН и дата внесения в ЕГРЮЛ записи,)(.+)')
    p_position = re.compile(
        r'(.+)(40&Должность&)(.+)(&41&ГРН и дата внесения)(.+)')
    p_founder = re.compile(
        r'(.+)(43&Полное наименование&)(.+)(&44&ГРН и дата внесения)(.+)')
    p_fou_country = re.compile(
        r'(.+)(45&Страна происхождения&)(.+)(&46&Дата регистрации&)(.+)')
    p_fou_address = re.compile(
        r'(.+)(&49&Адрес .место нахождения. в странепроисхождения&)(.+)(&50&ГРН и дата внесения )(.+)'
    )
    p_fou_capital = re.compile(
        r'(.+)(51&Номинальная стоимость доли .в рублях.)(.+)(52&Размер доли)(.+)'
    )
    p_percent = re.compile(
        r'(.+)(52&Размер доли .в процентах.)(.+)(53&ГРН и дата внесения в ЕГРЮЛ записи)(.+)'
    )
    p_activity = re.compile(
        r'(.+)(&54&Код и наименование вида деятельности&)(.+)(&55&ГРН и дата внесения в ЕГРЮЛ записи)(.+)'
    )

    # Keep only the captured field (group 3, or groups 3+4 for dates).
    # NOTE(review): when a pattern does NOT match, re.sub returns all of
    # text11 unchanged, so the field silently holds the full text —
    # verify the template rendering tolerates this.
    #def stuff_docx():
    vipiska_date = re.sub(p_vipiska_date, r'\3', text11)
    vipiska_number = re.sub(p_number_date, r'\2', text11)
    full_title = re.sub(p_full_title, r'\3', text11)
    inn = re.sub(p_inn, r'\3', text11)
    ogrn = re.sub(p_ogrn, r'\3', text11)
    name = re.sub(p_name, r'\3', text11)
    date_egrul = re.sub(p_date_egrul, r'\3 \4', text11)
    index = re.sub(p_index, r'\3', text11)
    subject_RF = re.sub(p_subject_RF, r'\3', text11)
    street = re.sub(p_street, r'\3', text11)
    house = re.sub(p_house, r'\3', text11)
    corpus = re.sub(p_corpus, r'\3', text11)
    flat = re.sub(p_flat, r'\3', text11)
    date_egrul2 = re.sub(p_date_egrul2, r'\3 \4', text11)
    registration = re.sub(p_registration, r'\3', text11)
    date_registration = re.sub(p_date_registration, r'\3', text11)
    date_egrul3 = re.sub(p_date_egrul3, r'\3 \4', text11)
    kpp = re.sub(p_kpp, r'\3', text11)
    date_inn = re.sub(p_date_inn, r'\3', text11)
    tax_office = re.sub(p_tax_office, r'\3', text11)
    capital = re.sub(p_capital, r'\3', text11)
    capital2 = re.sub(p_capital2, r'\3', text11)
    surname = re.sub(p_surname, r'\3', text11)
    name_gener = re.sub(p_name_gener, r'\3', text11)
    patronymic = re.sub(p_patronymic, r'\3', text11)
    inn_gener = re.sub(p_inn_gener, r'\3', text11)
    position = re.sub(p_position, r'\3', text11)
    founder = re.sub(p_founder, r'\3', text11)
    fou_country = re.sub(p_fou_country, r'\3', text11)
    fou_address = re.sub(p_fou_address, r'\3', text11)
    fou_capital = re.sub(p_fou_capital, r'\3', text11)
    percent = re.sub(p_percent, r'\3', text11)
    activity = re.sub(p_activity, r'\3', text11)
    print(text11)

    # Fill the docx template and save a copy named after the INN.
    #def make_docx():
    doc = DocxTemplate("D:\\Data\\document\\выписка.docx")
    context = {
        'framing_date': vipiska_date,
        'vipiska_number': vipiska_number,
        'full_title': full_title,
        'inn': inn,
        'ogrn': ogrn,
        'name': name,
        'date_egrul': date_egrul,
        'index': index,
        'subject_RF': subject_RF,
        'street': street,
        'house': house,
        'corpus': corpus,
        'flat': flat,
        'date_egrul2': date_egrul2,
        'registration': registration,
        'date_registration': date_registration,
        'date_egrul3': date_egrul3,
        'kpp': kpp,
        'date_inn': date_inn,
        'tax_office': tax_office,
        'capital': capital,
        'capital2': capital2,
        'surname': surname,
        'name_gener': name_gener,
        'patronymic': patronymic,
        'inn_gener': inn_gener,
        'position': position,
        'founder': founder,
        'fou_country': fou_country,
        'fou_address': fou_address,
        'fou_capital': fou_capital,
        'percent': percent,
        'activity': activity
    }

    doc.render(context)
    doc.save('D:\\Data\\document\\выписка_' + str(inn) + '.docx')
示例#3
0
def pdf2txt(argv):
    """Convert PDF files to text/XML/HTML/tag output (pdfminer CLI clone).

    *argv* is a getopt-style argument list: flags mirror the classic
    pdf2txt.py tool (-o outfile, -t outtype, -p pagenos, -P password,
    -m maxpages, layout flags, ...); remaining arguments are input PDF
    paths.  When -t is omitted, the output type is inferred from the
    outfile extension, defaulting to 'text'.  Returns usage() for an
    unknown output type, otherwise None.
    """
    import getopt
    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    # NOTE(review): if -n (laparams = None) appears before -A/-V/-M/-L/
    # -W/-F, those handlers raise AttributeError on None — pre-existing
    # hazard, preserved as-is.
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    # Propagate the debug level to every pdfminer subsystem.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # Infer the output type from the outfile extension when not given.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w')  # bug fix: `file()` is Python 2 only
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              outdir=outdir)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               outdir=outdir)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    # Process every input file with the one shared device/output.
    for fname in args:
        fp = open(fname, 'rb')  # bug fix: `file()` is Python 2 only
        process_pdf(rsrcmgr,
                    device,
                    fp,
                    pagenos,
                    maxpages=maxpages,
                    password=password,
                    caching=caching,
                    check_extractable=True)
        fp.close()
    device.close()
    outfp.close()
    return
示例#4
0
def generate_pdf_pages(fp, maxpages=0, logger=None):
    """Return a list with one text string per PDF page read from *fp*.

    Each entry starts with '<page#> ===========================' and has
    runs of spaces collapsed to single spaces.  *fp* is closed before
    returning.  When *logger* is given, each processed page is logged.
    """
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.cmapdb import CMapDB
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    import re

    # Fixed extraction settings: all pages (up to maxpages), no
    # password, no image extraction, caching enabled.
    layout_params = LAParams()
    resource_mgr = PDFResourceManager(caching=True)

    extracted = []

    page_iter = PDFPage.get_pages(fp,
                                  set(),
                                  maxpages=maxpages,
                                  password='',
                                  caching=True,
                                  check_extractable=True)
    for page_no, pdf_page in enumerate(page_iter, 1):
        # Fresh buffer and device per page so pages stay separate.
        buf = StringIO()
        buf.write('{} ===========================\n'.format(page_no))

        text_device = TextConverter(resource_mgr,
                                    buf,
                                    codec='utf-8',
                                    laparams=layout_params,
                                    imagewriter=None)
        PDFPageInterpreter(resource_mgr, text_device).process_page(pdf_page)

        if logger:
            logger.info("Processing page: {}".format(page_no))

        text_device.close()
        raw_text = buf.getvalue()
        buf.close()

        # Get rid of all of those damn spaces.
        extracted.append(re.sub(r'[ ]+', ' ', raw_text))

    fp.close()

    return extracted
示例#5
0
def processPdf(pdf_filenames,d,i):
    """Extract one expense record per PDF in *pdf_filenames* into *d*.

    Each record list holds [category, vendor, date, currency, amount,
    ..., filename] under a fresh integer key.  *d* must support
    d[key].append on a missing key — presumably a defaultdict(list);
    TODO confirm with the caller.  Returns the updated *d*.
    """
    # NOTE(review): the *i* parameter is overwritten immediately, so
    # the value passed in is never used.
    i=len(d)+1
    print("in process pdf   "+str(d)+"  value i "+str(i))
    for file in pdf_filenames:
        path=file
        # Standard pdfminer pipeline writing into an in-memory buffer.
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        fp = open(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()

        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
            interpreter.process_page(page)

            # getvalue() re-reads the whole buffer every page; the
            # value surviving the loop covers all pages.
            # NOTE(review): `text` is unbound if the PDF has no pages.
            text = retstr.getvalue()

        fp.close()
        device.close()
        retstr.close()

        # All dates and currency amounts found anywhere in the text.
        matches = list(datefinder.find_dates(text))
        pattern = re.findall(r'([£$€$₹])[\s]?(\d+(?:\.\d{2})?)', text)
        print(matches)
        print(pattern)
        # Classify the receipt by vendor keywords and record the first
        # amount with its currency.
        # NOTE(review): every '€' branch below appends 'USD';
        # 'EUR' was probably intended — verify before relying on it.
        if 'Ola' in text:
            d[i].append('taxi')
            d[i].append('OOLA')
            d[i].append(str(matches[3].date()))
            if '₹' in pattern[0][0]:
                d[i].append('INR')
                d[i].append(pattern[0][1])
            elif '$' in pattern[0][0]:
                d[i].append('USD')
                d[i].append(pattern[0][1])
            elif '€' in pattern[0][0]:
                d[i].append('USD')
                d[i].append(pattern[0][1])
        elif 'Uber' in text:
            d[i].append('taxi')
            d[i].append('UBER')
            d[i].append(str(matches[3].date()))
            if '₹' in pattern[0][0]:
                d[i].append('INR')
                d[i].append(pattern[0][1])
            elif '$' in pattern[0][0]:
                d[i].append('USD')
                d[i].append(pattern[0][1])
            elif '€' in pattern[0][0]:
                d[i].append('USD')
                d[i].append(pattern[0][1])
        elif 'Invoice' in text:
            d[i].append('flight')
            d[i].append('international')
            d[i].append(str(matches[0].date()))
            if '₹' in pattern[0][0]:
                d[i].append('INR')
                d[i].append(pattern[0][1])
            elif '$' in pattern[0][0]:
                d[i].append('USD')
                d[i].append(pattern[0][1])
            elif '€' in pattern[0][0]:
                d[i].append('USD')
                d[i].append(pattern[0][1])
        else:
            # Fallback: domestic flight; amount taken from the line
            # following 'Total Fare' when present.
            d[i].append('flight')
            d[i].append('domestic')
            d[i].append(str(matches[3].date()))
            d[i].append('INR')
            words = text.split("\n")
            if 'Total Fare' in words:
                val = words.index('Total Fare') + 1
                d[i].append(words[val])
        d[i].append(file)
        i += 1
    print("after all "+str(d))
    return d
示例#6
0
    def convert(self):
        """Parse the phone-bill PDF at self.fname and write an .xls
        report next to it.

        Column values are pulled from *data* with the regexes typed into
        the UI line edits; progress is reported via self.ui.progressBar.
        Per-prefix tariffs are cached in 'tarif.txt' (pickle) and asked
        for via a dialog when unknown.  Returns the raw extracted text.

        Fixes vs. original: Python-2 `file()` replaced with open(),
        pickle files opened in binary mode, `dict.has_key` replaced
        with `in`, and `output.close` is now actually called.
        """
        self.ui.progressBar.setValue(0)
        if not self.pages:
            pagenums = set()
        else:
            pagenums = set(self.pages)

        output = StringIO()
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)

        infile = open(self.fname, 'rb')  # bug fix: `file()` is Python 2 only
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
        infile.close()
        converter.close()
        data = output.getvalue()
        output.close()  # bug fix: was `output.close` (never called)

        # Cell styles: bordered/centred cells plus date, time and
        # duration number formats; style3/5 are the bold header styles.
        style1 = xlwt.easyxf('font: bold off; align: wrap off, vert centre, horiz center; borders: top thin, bottom thin, left thin, right thin;')
        style1.num_format_str = 'DD-MM-YY'
        style2 = xlwt.easyxf('font: bold off; align: wrap off, vert centre, horiz center; borders: top thin, bottom thin, left thin, right thin;')
        style2.num_format_str = 'HH:MM'
        style4 = xlwt.easyxf('font: bold off; align: wrap off, vert centre, horiz center; borders: top thin, bottom thin, left thin, right thin;')
        style4.num_format_str = 'HH:MM:SS'
        style0 = xlwt.easyxf('font: bold off; align: wrap off, vert centre, horiz left; borders: top thin, bottom thin, left thin, right thin;')
        style3 = xlwt.easyxf('font: bold on; align: wrap off, vert centre, horiz center; borders: top double, bottom double, left double, right double;')
        style5 = xlwt.easyxf('font: bold on; align: wrap off, vert centre, horiz center; borders: top double, bottom double, left double, right double;')
        style5.num_format_str = '[h]:mm:ss;@'

        # Header row (row 3) of the report sheet.
        wb = xlwt.Workbook()
        ws = wb.add_sheet('A Test Sheet')
        ws.write(3,0,'Day',style3)
        ws.write(3,1,'Date',style3)
        ws.write(3,2,'Time',style3)
        ws.write(3,3,'E/Stn',style3)
        ws.write(3,4,'Service Name',style3)
        ws.write(3,5,'Destination',style3)
        ws.write(3,6,'Code',style3)
        ws.write(3,7,'Band',style3)
        ws.write(3,8,'Peak/Off-Peak',style3)
        ws.write(3,9,'Amount',style3)
        ws.write(3,10,'Unit',style3)
        ws.write(3,11,'Cost',style3)
        ws.write(3,12,'Tarif',style3)
        ws.write(3,13,'Cost_2',style3)

        # Data rows start at row 4; matchesDate sets the row count for
        # all other columns.
        i = 4

        self.ui.progressBar.setValue(10)
        #regDate = '\d\d-\d\d-\d\d'
        regDate = str(self.ui.lineEdit_2.text())
        matchesDate = re.findall(regDate, data)
        for Date in matchesDate:
            ws.write(i,1,Date,style1)
            i +=1

        i = 4

        #regDay = '\n(Tue|Wed|Fri|Thu|Mon|Sat|Sun)'
        regDay = str(self.ui.lineEdit_3.text())
        matchesDay = re.findall(regDay, data)
        for Day in matchesDay:
            ws.write(i,0,Day,style0)
            i +=1

        i = 4
        k = 0

        self.ui.progressBar.setValue(20)
        #regTime = '[^\n:]\d\d:\d\d'
        regTime = str(self.ui.lineEdit_4.text())
        matchesTime = re.findall(regTime, data)
        while k < len(matchesDate):
            ws.write(i,2,matchesTime[k],style2)
            k+=1
            i+=1

        i = 4
        k = 0

        #regService = 'Voice'
        regService = str(self.ui.lineEdit_5.text())
        matchesService = re.findall(regService, data)
        while k < len(matchesDate):
            ws.write(i,4,matchesService[k],style0)
            i+=1
            k+=1

        i = 4
        k = 0

        self.ui.progressBar.setValue(30)
        # Destination column: the first match is the subscriber's own
        # number (written to the sheet title block); the remaining
        # matches are the called numbers.  truedest2 keeps each number's
        # 4-digit prefix for the tariff lookup below.
        trueDest = []
        truedest2 = []
        regDest = str(self.ui.lineEdit_6.text())
        #regDest = '\d{7,}'
        matchesDest = re.findall(regDest, data)
        ws.write(0,0,'Mobile No',style3)
        ws.write(0,1,matchesDest[0],style3)
        for Dest in matchesDest:
            if Dest != matchesDest[0]:
                trueDest.append(Dest)
        while k < len(matchesDate):
            ws.write(i,5,trueDest[k],style0)
            truedest2.append(trueDest[k][0:4])
            i+=1
            k+=1

        i = 4
        k = 0

        self.ui.progressBar.setValue(40)
        regCode = str(self.ui.lineEdit_7.text())
        #regCode = r'\bL\b'
        matchesCode = re.findall(regCode, data)
        while k < len(matchesDate):
            ws.write(i,6,matchesCode[k],style0)
            i+=1
            k+=1

        i = 4
        k = 0

        self.ui.progressBar.setValue(50)
        regBand = str(self.ui.lineEdit_8.text())
        #regBand = r'\b(1|2|3)\b\n'
        matchesBand = re.findall(regBand, data)
        while k < len(matchesDate):
            ws.write(i,7,matchesBand[k],style0)
            i+=1
            k+=1

        i = 4
        k = 0

        self.ui.progressBar.setValue(60)
        regPeak = str(self.ui.lineEdit_9.text())
        #regPeak = r'\bO-P\b'
        matchesPeak = re.findall(regPeak, data)
        while k < len(matchesDate):
            ws.write(i,8,matchesPeak[k],style0)
            i+=1
            k+=1

        i = 4
        k = 0

        self.ui.progressBar.setValue(70)
        # Amount column: call duration HH:MM:SS converted to a time cell.
        regAmount = str(self.ui.lineEdit_10.text())
        #regAmount = r'\b00:\d\d:\d\d\b'
        matchesAmount = re.findall(regAmount, data)
        while k < len(matchesDate):
            hms = matchesAmount[k].split(':')
            ws.write(i,9,datetime.time(int(hms[0]),int(hms[1]),int(hms[2])),style4)
            i+=1
            k+=1

        i = 4

        regUnit = str(self.ui.lineEdit_11.text())
        #regUnit = r'\bH:M:S\b'
        matchesUnit = re.findall(regUnit, data)
        for Unit in matchesUnit:
            ws.write(i,10,Unit,style0)
            i+=1

        i = 4
        k = 0

        self.ui.progressBar.setValue(80)
        # Cost column: decimal comma converted to a float.
        regCost = str(self.ui.lineEdit_12.text())
        #regCost = r'\b\d+,\d+\b'
        matchesCost = re.findall(regCost, data)
        while k < len(matchesDate):
            ws.write(i,11,float(matchesCost[k].replace(',','.')),style0)
            i+=1
            k+=1

        i = 4
        k = 0

        self.ui.progressBar.setValue(90)
        # Tariff cache: best-effort load; missing/corrupt file just
        # means every prefix is asked for interactively.
        tarif = {}
        try:
            # bug fix: `file()` → open(); pickle requires binary mode.
            tariff = open('tarif.txt', 'rb')
            tarif = pickle.load(tariff)
            tariff.close()
        except Exception: pass
        while k < len(matchesDate):
            if truedest2[k] in tarif:  # bug fix: dict.has_key is Python 2 only
                ws.write(i,12,tarif[truedest2[k]],style0)
            else:
                tarifinput = QInputDialog.getText(self,'Tarif','Please enter tarif for '+truedest2[k]+':')
                tarif[truedest2[k]] = float(tarifinput[0])
                ws.write(i,12,tarif[truedest2[k]],style0)

            # Cost_2 = duration in minutes * tariff, computed in-sheet.
            ws.write(i,13,xlwt.Formula('(HOUR(J'+str(i+1)+')*60+MINUTE(J'+str(i+1)+')+SECOND(J'+str(i+1)+')/60)*M'+str(i+1)+''),style0)
            i+=1
            k+=1

        # Persist the (possibly extended) tariff cache.
        # bug fix: `file()` → open(); pickle requires binary mode.
        tariff = open('tarif.txt', 'wb')
        pickle.dump(tarif, tariff)
        tariff.close()

        # Totals row below the data.
        ws.write(len(matchesDate)+6,0,'Total:',style3)
        ws.write(len(matchesDate)+6,9,xlwt.Formula('SUM(J5:J'+str(len(matchesDate)+4)+')'),style5)
        ws.write(len(matchesDate)+6,11,xlwt.Formula('SUM(L5:L'+str(len(matchesDate)+4)+')'),style3)
        ws.write(len(matchesDate)+6,13,xlwt.Formula('SUM(N5:N'+str(len(matchesDate)+4)+')'),style3)

        wb.save(self.fname+'.xls')
        self.ui.progressBar.setValue(100)
        if self.ui.checkBox.isChecked():
            os.startfile(str(self.fname)+'.xls',)
        return data
示例#7
0
    def run(filepath):
        """Parse the PDF at *filepath* and write its layout as XML.

        Output always goes to 'pdfparser/minute_store/minutes.xml';
        prints a confirmation message when done.
        """
        import getopt

        # Fixed conversion settings (no command-line parsing happens).
        debug_level = 0
        pdf_password = ''
        page_numbers = set()
        page_limit = 0
        target_path = 'pdfparser/minute_store/minutes.xml'
        output_kind = 'xml'
        image_writer = None
        rotate_by = 0
        strip_control = False
        layout_mode = 'normal'
        text_codec = 'utf-8'
        page_index = 1
        zoom = 1
        use_cache = True
        show_page_numbers = True
        layout_params = LAParams()

        # Propagate the debug level to the pdfminer subsystems.
        PDFDocument.debug = debug_level
        PDFParser.debug = debug_level
        CMapDB.debug = debug_level
        PDFPageInterpreter.debug = debug_level

        resources = PDFResourceManager(caching=use_cache)

        out_stream = open(target_path, 'wb') if target_path else sys.stdout

        # output_kind is fixed to 'xml', but the text branch is kept.
        if output_kind == 'text':
            device = TextConverter(resources,
                                   out_stream,
                                   codec=text_codec,
                                   laparams=layout_params,
                                   imagewriter=image_writer)
        elif output_kind == 'xml':
            device = XMLConverter(resources,
                                  out_stream,
                                  codec=text_codec,
                                  laparams=layout_params,
                                  imagewriter=image_writer,
                                  stripcontrol=strip_control)

        source = open(filepath, 'rb')
        engine = PDFPageInterpreter(resources, device)
        for pdf_page in PDFPage.get_pages(source,
                                          page_numbers,
                                          maxpages=page_limit,
                                          password=pdf_password,
                                          caching=use_cache,
                                          check_extractable=True):
            pdf_page.rotate = (pdf_page.rotate + rotate_by) % 360
            engine.process_page(pdf_page)
        source.close()

        device.close()
        out_stream.close()
        print("Converted PDF to XML")
        return
示例#8
0
def get_text_from_pdf(pdfname, limit=1000):
    """Extract text from a PDF and return it as a list of cleaned chunks.

    NUL bytes and redundant whitespace are removed, number-only lines
    and dangling single words are dropped, hyphenated line breaks are
    re-joined, and the text is split into chunks of roughly *limit*
    characters at sentence/blank-line boundaries.  Returns '' when
    *pdfname* is empty or the file cannot be opened; otherwise a list
    of chunk strings ending with a '\\n' entry.
    """
    if (pdfname == ''):
        return ''
    else:
        # Open the target PDF; bail out with '' if it cannot be read.
        try:
            fp = open(pdfname, 'rb')
        except OSError:  # was a bare except; open() raises OSError here
            return ''

    # Extract the text with pdfminer (vertical-text detection enabled).
    rsrcmgr = PDFResourceManager()
    out_fp = StringIO()
    la_params = LAParams()
    la_params.detect_vertical = True
    device = TextConverter(rsrcmgr, out_fp, codec='utf-8', laparams=la_params)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp,
                                  pagenos=None,
                                  maxpages=0,
                                  password=None,
                                  caching=True,
                                  check_extractable=True):
        interpreter.process_page(page)
    text = out_fp.getvalue()
    fp.close()
    device.close()
    out_fp.close()

    # Treat the whole document as one "line" (per-line splitting was
    # deliberately disabled).
    #lines = text.splitlines()
    lines = []
    lines.append(text)

    outputs = []
    output = ""

    # UTF-8 byte sequences to strip out.
    replace_strs = [b'\x00']

    is_blank_line = False

    # Walk the (single-element) line list, cleaning and re-chunking.
    for line in lines:

        # Round-trip through bytes to drop the unwanted sequences.
        line_utf8 = line.encode('utf-8')
        for replace_str in replace_strs:
            line_utf8 = line_utf8.replace(replace_str, b'')
        line = line_utf8.decode()

        # Collapse runs of spaces and trim the ends.
        line = re.sub("[ ]+", " ", line)
        line = line.strip()

        # Skip blank lines, but remember them as paragraph breaks.
        if len(line) == 0:
            is_blank_line = True
            continue

        # Skip number-only lines (page numbers etc.).
        if is_float(line):
            continue

        # Skip single-word lines that do not end a sentence.
        # Bug fix: the original compared the bound method `list.count`
        # to 1 (`line.split(" ").count == 1`), which is always False,
        # so this filter never fired.
        if len(line.split(" ")) == 1 and not line.endswith("."):
            continue

        # At a paragraph/sentence boundary: flush once *limit* is hit.
        if is_blank_line or output.endswith("."):
            if (len(output) > limit):
                outputs.append(output)
                output = ""
            else:
                output += "\r\n"
        # Continuation of a hyphenated word from the previous line.
        elif not is_blank_line and output.endswith("-"):
            output = output[:-1]
        # Otherwise join with a single space.
        else:
            output += " "

        output += str(line)
        is_blank_line = False

    outputs.append(output)
    outputs.append('\n')
    return outputs
示例#9
0
# setting absolute paths
# Project root used to resolve the parser's input/output locations.
prjPath = r'C:\Users\Natarajan\Desktop\PDFParser'


def convert(case, fname, pages=None):
    """Convert a PDF file to plain text or HTML with pdfminer.

    Args:
        case: Output format selector — 'text' for plain text, 'HTML' for HTML.
        fname: Path to the PDF file to convert.
        pages: Optional iterable of zero-based page numbers to convert;
            when falsy, every page is converted.

    Returns:
        The converted document: ``str`` when case == 'text', ``bytes`` when
        case == 'HTML'.  (The original computed this value and then silently
        discarded it.)

    Raises:
        ValueError: If ``case`` is not one of the supported formats.  The
            original fell through and later crashed with an unbound-name
            error on ``converter``.
    """
    if case not in ('text', 'HTML'):
        raise ValueError("case must be 'text' or 'HTML', got %r" % (case,))

    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    codec = 'utf-8'
    caching = True

    if case == 'text':
        output = io.StringIO()
        converter = TextConverter(manager, output, codec=codec, laparams=LAParams())
    else:  # case == 'HTML'
        output = io.BytesIO()
        converter = HTMLConverter(manager, output, codec=codec, laparams=LAParams())

    interpreter = PDFPageInterpreter(manager, converter)
    # 'with' guarantees the input file is closed even if a page fails to parse.
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)

    convertedPDF = output.getvalue()
    converter.close()
    output.close()
    return convertedPDF
示例#10
0
#pip install pdf miner
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter, HTMLConverter, XMLConverter
from pdfminer.layout import LAParams
import io

# Raw strings for Windows paths: in the original, '\f' in a normal string
# literal was interpreted as a form-feed escape, silently corrupting the path.
pdf_path = r'C:\Users\somepath\filename.pdf'  #path to your pdf file

lp = LAParams()
rm = PDFResourceManager()
mem = io.StringIO()
cnv = TextConverter(rm, mem, laparams=lp)
ip = PDFPageInterpreter(rm, cnv)

# 'with' ensures both the source PDF and the output file are closed
# (the original left both handles open).
with open(pdf_path, 'rb') as pdf:
    for i in PDFPage.get_pages(pdf):
        ip.process_page(i)

# Read the accumulated text once, after all pages are processed, instead of
# re-reading the entire growing buffer on every loop iteration.
text = mem.getvalue()

with open(r"F:\AIB\covertedtext.txt", 'wb') as out:  #path to your destination file
    out.write(text.encode('utf-8'))

print("DONE")
def extract_text_from_pdf(pdf_files):
    """Extract text from each PDF and append the sentences mentioning
    goodwill, acquisitions, business combinations or divestitures to a
    sibling '<pdf name>.txt' file.

    Args:
        pdf_files: Iterable of paths to PDF files (assumed to end in '.pdf').
    """
    # Phrases of interest.  "GOODWILL AND OTHER" is a substring of the longer
    # goodwill phrase and "ACQUISITION" of "ACQUISITIONS", so the original's
    # duplicated if-blocks wrote the same sentence up to three times; testing
    # with any() writes each matching sentence exactly once.
    keywords = (
        "GOODWILL AND OTHER INTANGIBLE ASSETS",
        "GOODWILL AND OTHER",
        "ACQUISITION",
        "BUSINESS COMBINATION",
        "DIVESTITURE",
    )

    for PDF_file in pdf_files:
        # Fresh converter per file so text from one PDF never leaks into the next.
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(PDF_file, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)

            text = fake_file_handle.getvalue()

        # Naive sentence split on '. '; append matches to '<name>.txt'.
        output_file_name = os.path.join(PDF_file[:-4] + ".txt")
        with open(output_file_name, "a") as f:
            for line in text.split(". "):
                upper = line.upper()
                if any(keyword in upper for keyword in keywords):
                    f.write(line)
                    f.write("\n")

        # close open handles
        converter.close()
        fake_file_handle.close()
def get_report_startpage(pdf):
    """Locate the first page of the financial statements inside a PDF.

    Looks for annual/quarterly-report wording on the first pages, then scans
    the first 10 pages for a table of contents and reads the page number
    listed next to the financial-report entry.

    Arguments:
        pdf {str} -- path to the PDF file
    Returns:
        start_page {int} -- starting page of the financial statements
            (defaults to 1 when the document is 30 pages or fewer, does not
            look like a report, or no TOC entry can be found)
    """
    getp= pdfplumber.open(pdf)
    total=len(getp.pages)
    # counts pages processed so far (checks are limited to the first 10 pages)
    count=0
    # starting page of the financial statements (returned value)
    start_page=1
    # becomes True once the file looks like an annual/quarterly report
    flag=False
    # pdfminer resource manager holding shared resources
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    codec = 'utf-8'
    outfp = StringIO()
    # text-converter device that writes extracted text into outfp
    device = TextConverter(rsrcmgr=rsrcmgr, outfp=outfp, codec=codec, laparams=laparams)
    if total>30:
        print('总页数',total)
        with open(pdf, 'rb') as fp:
            # interpreter that renders page content through the device
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos=set()
            # iterate over every page of the pdf
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
                count+=1
                teststr=''
                interpreter.process_page(page)
                # NOTE(review): outfp accumulates across pages, so teststr holds
                # the text of ALL pages processed so far, not just this page.
                teststr=outfp.getvalue()
              
                # does the text mention an annual/quarterly report?  If page 1
                # has no such wording, there may be no start page to look for.
                rs=re.search('(年\s*|季\s*)度?\s*报\s*告?',teststr)
                #print(teststr)
                if rs!=None and count==1:
                    # report wording found on page 1 -> look for the TOC next
                    flag=True
                    continue
                elif rs==None and count==1:
                    # not found on page 1 (some cover pages carry a seal that
                    # breaks text extraction) -> check page 2
                    print('第1页未检测到年/季报等文字,检测第二页')
                    continue
                elif rs!=None and count==2:
                    # report wording found on page 2 -> look for the TOC next
                    flag=True
                    continue
                elif rs==None and count==2:
                    # still nothing on pages 1-2: treat the file as not being
                    # an annual/quarterly report and return the default
                    if flag==False:
                        device.close()
                        outfp.close()
                        print('当前文件的财务报表起始页为',start_page)
                        return start_page
                # report wording was found -> search the first 10 pages for a TOC
                if flag==True:
                    # 1. only the first 10 pages are examined
                    if count<11:
                        # look for the "table of contents" heading
                        if re.search('目\s*录',teststr,flags=0):

                            # check whether this TOC page lists a
                            # financial-report entry

                            #reg_stmt = re.compile(r'财务报告\D{10,}(\d{1,3})')
                            
                            ret = re.search('财务报告\s*(.)*\d', teststr)
                            if ret!=None:
                                ret=ret.group()
                                # split on dot-leader characters and strip spaces
                                tstr=[y.strip() for y in re.split(r'[…¨ .]',ret) if len(y)!=0]
                                # first item is the entry title, second the page number
                                start_page=int(tstr[1])
                                device.close()
                                outfp.close()
                                print('当前文件的财务报表起始页为',start_page)
                                return start_page
                            else:
                                # TOC page without a financial-report entry:
                                # move on to the next page
                                count+=1
                                continue
                        else:
                            # no TOC heading on this page, try the next one
                            print('第',count,'页未找到目录二字,查找下一页')
                            continue
                    else:
                        print('10页内未找到目录二字')
                        # TOC not found within 10 pages -> give up
                        break          

    else:
        # documents of 30 pages or fewer are not processed
        print('当前文件的财务报表起始页为',start_page)
        return start_page
    
    device.close()
    outfp.close()
    print('当前文件的财务报表起始页为',start_page)
    return start_page
示例#13
0
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from subprocess import call
from pdfminer.layout import LAParams
import os

url = 'http://www.ird.gov.hk/chi/pdf/c_s88list.pdf'

# Bug fix: the original called the Python-2-only `urllib2` module without
# ever importing it, which raised NameError at runtime; use the stdlib
# urllib.request instead (script-local import keeps this example standalone).
import urllib.request

# Download the PDF with a browser-like User-Agent.
request = urllib.request.Request(url, headers={'User-agent': 'Mozilla/5.0'})
pdfdata = urllib.request.urlopen(request).read()
with open('document.pdf', 'wb') as f:
    f.write(pdfdata)

# Strip the PDF's encryption so pdfminer can parse it.
call('qpdf --password= --decrypt {0}/document.pdf {0}/decrypted.pdf'.format(
    os.getcwd()).split())

# Extract text; char_margin=10 merges widely spaced chunks into single lines.
with open('decrypted.pdf', 'rb') as decrypted, open('modifiedla.txt', 'w') as outfp:
    parser = PDFParser(decrypted)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    laparams = LAParams(char_margin=10)
    device = TextConverter(rsrcmgr, outfp, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
示例#14
0
def arc():
    """Convert the selected file to the format chosen in the UI.

    Reads the target format from the ``formato`` widget, converts the file at
    the module-level ``diretorio`` (name ``filename``, extension
    ``file_extension``) via MS Office COM automation, pdftables, pandas,
    docx2txt or pdfminer depending on the source/target pair, then shows a
    success dialog and closes the ``root`` window.

    Relies on module-level globals: ``formato``, ``diretorio``, ``filename``,
    ``file_extension``, the extension groups ``ArqDOCX``/``ArqPPT``/``ArqXLSX``,
    ``messagebox`` and ``root``.
    """
    destino = str(formato.get())
    # Map the Portuguese combo-box labels onto plain file extensions.
    if destino == "Arquivo do Word": destino = "docx"
    if destino == "Arquivo do Power-Point": destino = "ppt"
    if destino == "Arquivo do Excel": destino = "xlsx"
    if destino == "Arquivo de Texto": destino = "txt"
    import win32com.client as win32
    from os import path
    in_file = path.abspath(diretorio)
    out_file = path.abspath(filename)

    if destino == "docx":
        if file_extension in ArqDOCX or file_extension.lower(
        ) == ".pdf" or file_extension.lower() == ".txt":
            # Word COM: FileFormat=16 is wdFormatDocumentDefault (.docx).
            word = win32.DispatchEx("Word.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Documents.Open(in_file)
            doc.SaveAs(out_file, FileFormat=16)
            doc.Close()
            word.Quit()

    elif destino.lower() == "pdf":
        if file_extension.lower() in ArqPPT:
            # PowerPoint COM: FileFormat=32 is ppSaveAsPDF.
            word = win32.DispatchEx("PowerPoint.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Presentations.Open(in_file)
            doc.SaveAs(out_file, FileFormat=32)
            doc.Close()
            word.Quit()
        elif file_extension.lower() in ArqXLSX:
            # Excel COM: ExportAsFixedFormat(0, ...) exports as PDF.
            word = win32.DispatchEx("Excel.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Workbooks.Open(in_file)
            doc.ExportAsFixedFormat(0, out_file)
            doc.Close()
            word.Quit()
        elif file_extension.lower() in ArqDOCX or file_extension.lower(
        ) == ".txt":
            # Bug fixes: the original referenced win32com.client.Dispatch
            # although the module is imported only as ``win32`` (NameError in
            # this branch), and saved to ``in_file`` instead of ``out_file``,
            # clobbering the source path.  FileFormat=17 is wdFormatPDF.
            word = win32.DispatchEx('Word.Application')
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Documents.Open(in_file)
            doc.SaveAs(out_file, FileFormat=17)
            doc.Close()
            word.Quit()

    elif destino.lower() == "xlsx":
        if file_extension.lower() == ".pdf":
            import pdftables_api
            c = pdftables_api.Client('to7jluln0hvr')
            c.xlsx(diretorio, filename + '.xlsx')
        elif file_extension.lower() == ".txt" or file_extension.lower(
        ) in ArqDOCX:
            import pandas as pd
            df = pd.read_csv(diretorio, header=None, delim_whitespace=True)
            df.to_excel(filename + '.xlsx', index=False, header=None)

    elif destino.lower() == "txt":
        if file_extension in ArqDOCX:
            import docx2txt
            text = docx2txt.process(diretorio)
            with open(filename + ".txt", "w") as file:
                print(text, file=file)
        elif file_extension.lower() == ".pdf":
            from io import StringIO
            from pdfminer.pdfparser import PDFParser
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
            from pdfminer.converter import TextConverter
            from pdfminer.layout import LAParams
            from pdfminer.pdfpage import PDFPage
            output_string = StringIO()
            # Renamed the context variable: the original shadowed the outer
            # ``in_file`` path used by the success dialog below.
            with open(diretorio, 'rb') as source_fh:
                parser = PDFParser(source_fh)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                device = TextConverter(rsrcmgr,
                                       output_string,
                                       laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
            with open(filename + ".txt", "w") as final:
                final.write(output_string.getvalue())
        elif file_extension.lower() in ArqXLSX:
            import pandas as pd
            read_file = pd.read_excel(diretorio, header=None)
            read_file.to_csv(filename + ".txt", index=None, header=True)

    messagebox.showinfo(
        "Formato convertido",
        "Formato de ficheiro convertido com sucesso.\n\n" +
        file_extension[1:].upper() + " para " + destino.upper() +
        "\n\nSalvo em: " + out_file + "." + destino)
    root.destroy()
示例#15
0
def _get_pdf_content(url, page_nums=[0]):
	"""
	(str) --> list

	Downloads the .pdf resume at *url* and parses it for skills.

	Tries three URL variants in order — the URL as given, with "www."
	inserted after the scheme, and with the host's leading "www." stripped —
	returning the skill list from the first variant whose download and parse
	succeeds, or [] when all three fail.

	NOTE(review): Python 2 code (print statement, file(), unicode).
	page_nums=[0] is a mutable default argument, but it is never read —
	confirm before removing.
	"""
	resume = urllib.URLopener()
	# Just in case, try opening the .pdf with
	# the url found
	try:
		if type(url) != str and type(url) != unicode:
			raise TypeError
	except TypeError:
		raise TypeError(bcolors.FAIL + "Provided URL is bad type. URL must be a string" + bcolors.ENDC) 
	try:
		# If we are redirected, follow it
		r = requests.get(url)
		# Call the download file, "resume.pdf"
		resume.retrieve(r.url, "resume.pdf")
		content = ""

		# Use the PDFMiner package to grab the
		# text from the .pdf file
		rsrcmgr = PDFResourceManager()
		retstr = StringIO()
		codec = 'utf-8'
		laparams = LAParams()
		device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
		# Open the downloaded file here,
		# ---> 'rb' means hard read, regardless of the unicode
		pdf = file("resume.pdf", 'rb')
		interpreter = PDFPageInterpreter(rsrcmgr, device)
		password = ""
		maxpages = 1
		caching = True
		pagenos = set()
		for page in PDFPage.get_pages(pdf, pagenos, maxpages=maxpages, password=password, caching=caching,
									  check_extractable=True):
			interpreter.process_page(page)
		# Store the text as a string here
		text = retstr.getvalue()

		# Lower the text, for easier parse
		text = text.lower()

		# Use find_skills function, to parse the
		# text as it would with the HTML
		lstSkills = _find_skills(text)
		os.remove("resume.pdf")

		# Return the string of Skills
		return lstSkills
	# NOTE(review): bare except deliberately swallows every failure so the
	# next URL variant can be attempted (it also catches KeyboardInterrupt).
	except:
		# If we run into an error, continue on, and
		# move onto the next try statement
		pass
		
	# We found that some people don't like to
	# add "www" with their http://, which we
	# then attempt to fix here
	try:

		# Add www. to the string here
		url = url[:7] + "www." + url[7:]  #######################Try request

		# Do the same as the above try statement
		resume.retrieve(url, "resume.pdf")
		content = ""

		rsrcmgr = PDFResourceManager()
		retstr = StringIO()
		codec = 'utf-8'
		laparams = LAParams()
		device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
		pdf = file("resume.pdf", 'rb')
		interpreter = PDFPageInterpreter(rsrcmgr, device)
		password = ""
		maxpages = 1
		caching = True
		pagenos = set()
		for page in PDFPage.get_pages(pdf, pagenos, maxpages=maxpages, password=password, caching=caching,
									  check_extractable=True):
			interpreter.process_page(page)
		text = retstr.getvalue()
		text = text.lower()
		lstSkills = _find_skills(text)
		return lstSkills
	# NOTE(review): second best-effort fallback; errors intentionally ignored.
	except:
		pass

	# Finally try a non-redirected route, and
	# parse the .pdf as we did in the first
	# try statement
	try:
		# NOTE(review): slicing assumes a "http://www." prefix — this strips
		# characters 7-9 ("www") plus the following dot.
		url = url[:7] + url[10:]
		resume.retrieve(url, "resume.pdf")
		content = ""
		rsrcmgr = PDFResourceManager()
		retstr = StringIO()
		codec = 'utf-8'
		laparams = LAParams()
		device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
		pdf = file("resume.pdf", 'rb')
		interpreter = PDFPageInterpreter(rsrcmgr, device)
		password = ""
		maxpages = 1
		caching = True
		pagenos = set()
		for page in PDFPage.get_pages(pdf, pagenos, maxpages=maxpages, password=password, caching=caching,
									  check_extractable=True):
			interpreter.process_page(page)
		text = retstr.getvalue()
		text = text.lower()
		lstSkills = _find_skills(text)
		return lstSkills
	except:
		# Return an empty array, indicating that
		# we were unable to retrieve any
		# information from your resume
		print bcolors.FAIL + "Unable to parse pdf with given URL" + bcolors.ENDC
		return []
示例#16
0
def create_candidates(path, sel_id, min_req, desire_req, remote_ind, scorer, model_type):
    """Build candidate records from resumes stored in the AWS S3 bucket ``rosev0``.

    Every object under ``path`` is downloaded and its plain text extracted:
    .docx via python-docx, .pdf via pdfminer, .doc via antiword (resumes with
    other or missing extensions are skipped).  The text is run through the
    spaCy NER models selected by ``model_type`` ('3c' = separate work /
    education / personal models, 'whole' = one combined model) and, when
    ``scorer == 'scorer'``, each candidate is scored against ``min_req`` /
    ``desire_req`` and appended unless it is mostly empty and ranked 0.

    AWS
        S3
            bucket: rosev0
                mail_user / replace @ with _
                    chilean date and selection name

    Returns:
        tuple: (candidates, high_ind, low_ind, medium_ind) — accepted
        candidate JSON records plus counters of high/low/medium scores.
    """
    #connecting to AWS S3
    s3 = boto3.resource("s3",
                    region_name='us-east-2',
                    aws_access_key_id=os.environ.get('AWS_KEY'),
                    aws_secret_access_key=os.environ.get('AWS_SECRET'))
    # loading models
    work_nlp = spacy.load('selection/models/work')
    ed_nlp = spacy.load('selection/models/education')
    per_nlp = spacy.load('selection/models/personal')
    whole_nlp = spacy.load('selection/models/whole')
    rose_bucket = s3.Bucket(r'rosev0')
    low_ind = 0
    high_ind = 0
    medium_ind = 0
    candidates = []
    for resume in rose_bucket.objects.filter(Prefix=path):
        key = resume.key
        body = resume.get()['Body'].read()
        buffer = io.BytesIO()
        buffer.write(body)
        ext = re.search(r'\.[a-z]+$', key)
        # Bug fix: reset per resume so an unsupported extension cannot reuse
        # the previous resume's text (or raise NameError on the first pass).
        text = None
        # Bug fix: these were only assigned inside the scorer branch but are
        # read unconditionally below, raising NameError when scorer != 'scorer'.
        high = low = medium = False
        ###body comes in binary stream, we have to decode it
        if ext is None:
            continue
        elif ext.group() == '.docx':
            document = Document(buffer)
            text = "\n".join([paragraph.text for paragraph in document.paragraphs])
        elif ext.group() == '.pdf':
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos = set()
            for page in PDFPage.get_pages(buffer, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
                interpreter.process_page(page)
            text = retstr.getvalue()
            # Release pdfminer resources (previously leaked once per resume).
            device.close()
            retstr.close()
        elif ext.group() == '.doc':
            #LINUX version handles damaged files and text in docs
            split = str(key).split('/')
            #replace special characters, linux problem reading path
            filename = str(split[-1]).replace('$', '_').replace('#', '_')
            pathdoc = 'selection/tmp/' + filename
            rose_bucket.download_file(key, pathdoc)
            try:
                output = subprocess.check_output('antiword "' + pathdoc + '"', shell=True)
                text = output.decode('utf-8')
            except Exception:
                # best effort: skip resumes antiword cannot parse
                continue

        if text is not None:
            results = []
            # first we create list for work classes, we need to procces them, get them together
            comp_work = []
            desig_work = []
            years_work = []
            other_work = []
            desig_ind = []
            years_ind = []
            idioms = []
            skills = []
            comb = []
            # then we do the same with college attributes
            grad_ed = []
            colleges = []
            degrees = []
            certif = []
            # then personal
            names = []
            locations = []
            mails = []
            phones = []
            if model_type == '3c':
                doc_work = work_nlp(text)
                doc_ed = ed_nlp(text)
                doc_per = per_nlp(text)
                #create array with entity text from algorithm inference
                for ent in doc_work.ents:
                    for value in [ent.text]:
                        if ent.label_ == 'companies worked at':
                            comp_work.append(value)
                        elif ent.label_ == 'designation':
                            desig_work.append(value.lower())
                            desig_ind.append(ent.start_char)
                        elif ent.label_ == 'years of experience':
                            years_work.append(value)
                            years_ind.append(ent.start_char)
                        elif ent.label_ == 'idioms':
                            idioms.append(value.lower())
                        elif ent.label_ == 'skills':
                            skills.append(value.lower())
                        else:
                            other_work.append([ent.label_, value])
                        results.append([ent.label_, value, text.index(value)])
                for ent in doc_ed.ents:
                    for value in [ent.text]:
                        if ent.label_ == 'graduation year':
                            grad_ed.append(value)
                        elif ent.label_ == 'college':
                            colleges.append(value.lower())
                        elif ent.label_ == 'degree':
                            degrees.append(value.lower())
                        elif ent.label_ == 'certifications':
                            certif.append(value.lower())
                        results.append([ent.label_, value, text.index(value)])
                for ent in doc_per.ents:
                    for value in [ent.text]:
                        if ent.label_ == 'name':
                            names.append(value)
                        elif ent.label_ == 'location':
                            locations.append(value)
                        elif ent.label_ == 'mail':
                            mails.append(value)
                        elif ent.label_ == 'phone':
                            phones.append(value)
                        results.append([ent.label_, value, text.index(value)])
            elif model_type == 'whole':
                doc = whole_nlp(text)
                for ent in doc.ents:
                    for value in [ent.text]:
                        if ent.label_ == 'companies worked at':
                            comp_work.append(value)
                        elif ent.label_ == 'designation':
                            desig_work.append(value.lower())
                            desig_ind.append(ent.start_char)
                        elif ent.label_ == 'years of experience':
                            years_work.append(value)
                            years_ind.append(ent.start_char)
                        elif ent.label_ == 'idioms':
                            idioms.append(value.lower())
                        elif ent.label_ == 'skills':
                            skills.append(value.lower())
                        elif ent.label_ == 'location':
                            locations.append(value)
                        elif ent.label_ == 'mail':
                            mails.append(value)
                        elif ent.label_ == 'phone':
                            phones.append(value)
                        elif ent.label_ == 'name':
                            names.append(value)
                        elif ent.label_ == 'graduation year':
                            grad_ed.append(value)
                        elif ent.label_ == 'college':
                            colleges.append(value.lower())
                        elif ent.label_ == 'degree':
                            degrees.append(value.lower())
                        elif ent.label_ == 'certifications':
                            certif.append(value.lower())
                        else:
                            other_work.append([ent.label_, value])
                        results.append([ent.label_, value, text.index(value)])
            if scorer == 'scorer':
                cand_json, high, low, medium = score_candidate(comp_work, desig_work, years_work, other_work, desig_ind, remote_ind,
                                                  years_ind, idioms, skills, comb, grad_ed, colleges, degrees,
                                                  certif, names, locations, mails, phones, min_req, desire_req, sel_id)
                # Drop mostly-empty candidates ('[]' appears 5+ times) ranked 0.
                match = re.findall(r'(\[])', str(cand_json))
                if len(match) >= 5 and cand_json['info']['rank'] == 0:
                    pass
                else:
                    candidates.append(cand_json)
            if high:
                high_ind = high_ind + 1
            if low:
                low_ind = low_ind + 1
            if medium:
                medium_ind = medium_ind + 1
    return candidates, high_ind, low_ind, medium_ind
def _pages_to_text(fh):
    '''
    Yield the plain text of each page of the already-open PDF handle *fh*.

    A fresh TextConverter is created per page on purpose: the converter
    appends to its output buffer, so reusing one across pages would yield
    cumulative text instead of per-page text.  Stops silently on
    PDFSyntaxError, matching the original behaviour.
    '''
    try:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager,
                                      fake_file_handle,
                                      codec='utf-8',
                                      laparams=LAParams())
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            page_interpreter.process_page(page)

            text = fake_file_handle.getvalue()
            yield text

            # close open handles
            converter.close()
            fake_file_handle.close()
    except PDFSyntaxError:
        return


def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    :param pdf_path: path to PDF file to be extracted (remote or local)
    :return: iterator of string of extracted text
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    # The original duplicated the whole page-extraction loop in both branches;
    # the shared generator above removes that duplication.
    if not isinstance(pdf_path, io.BytesIO):
        # extract text from local pdf file
        with open(pdf_path, 'rb') as fh:
            yield from _pages_to_text(fh)
    else:
        # extract text from remote pdf file (already wrapped in BytesIO)
        yield from _pages_to_text(pdf_path)
示例#18
0
def synthesize_pdf(
    pdf_file,
    json_file,
    dst_dir,
    max_fonts,
    max_pages,
    num_outputs_per_document,
    synthesizer_class,
):
    """Create ``num_outputs_per_document`` synthetic variants of a PDF.

    Loads the ground truth from *json_file*, runs pdfminer over the PDF to
    collect the fonts actually used, then rewrites each page's content stream
    with an instance of *synthesizer_class*, saving '<stem>-<i>.pdf' and
    '<stem>-<i>.json' into *dst_dir* for each output index *i* that does not
    already exist.

    Raises:
        AlreadyProcessed: every requested output already exists on disk.
        TooManyPagesException: page count exceeds *max_pages* (when truthy).
        TooManyFontsException: font count exceeds *max_fonts* (when truthy).
        NoTextException: pdfminer extracted no non-whitespace text.
    """
    ground_truth = json.loads(json_file.read_text())
    pdf_io = BytesIO(pdf_file.read_bytes())
    output_string = StringIO()
    rsrcmgr = PDFResourceManager(caching=True)
    device = TextConverter(rsrcmgr,
                           output_string,
                           codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # accumulates fontname -> font object across all processed pages
    interpreter_fonts = {}

    def _out_path(_i, suffix):
        # destination path for output index _i, e.g. '<stem>-0.pdf'
        return dst_dir / f'{json_file.stem}-{_i}{suffix}'

    # indices whose .pdf/.json pair is missing and must still be generated
    k_to_process = []
    for i in range(num_outputs_per_document):
        if not (_out_path(i, '.pdf').exists()
                and _out_path(i, '.json').exists()):
            k_to_process.append(i)

    if not k_to_process:
        raise AlreadyProcessed(f'Already processed {pdf_file} {json_file}')

    with pikepdf.Pdf.open(pdf_file) as pdf:
        if max_pages and len(pdf.pages) > max_pages:
            raise TooManyPagesException(
                f'Too many pages {len(pdf.pages)} > {max_pages} in PDF, skipping!'
            )

        # Walk pikepdf pages and pdfminer pages in lockstep; only the miner
        # page is processed here (the pikepdf page keeps the counts aligned).
        for page_number, (page, miner) in enumerate(
                zip(pdf.pages, PDFPage.get_pages(pdf_io))):
            interpreter.process_page(miner)
            interpreter_fonts.update(interpreter.fontmap)

    if max_fonts and len(interpreter_fonts) > max_fonts:
        raise TooManyFontsException(
            f'Too many fonts {len(interpreter_fonts)} > {max_fonts} in PDF, skipping!'
        )

    # Reject PDFs whose extracted text is empty after stripping all whitespace
    # (e.g. scanned image-only documents).
    if not re.sub(f'[{re.escape(string.whitespace)}]', '',
                  output_string.getvalue()):
        raise NoTextException('PDF does not have any text! Skipping')

    # pdfminer font names are keyed without the leading '/', pikepdf uses '/F...'
    font_map = {
        f'/{k}': Font(f'/{k}', v)
        for k, v in interpreter_fonts.items()
    }
    synthesizer = synthesizer_class(ground_truth, font_map)

    with pikepdf.Pdf.open(pdf_file) as pdf:
        new_contents = collections.defaultdict(list)
        new_ground_truths = {}

        # First pass: synthesize every page of every output before touching
        # the document, resetting the synthesizer between outputs.
        for i in k_to_process:
            for page_number, page in enumerate(pdf.pages):
                new_content_stream = parse_text(page, font_map, synthesizer)
                new_contents[i].append(
                    pdf.make_stream(
                        pikepdf.unparse_content_stream(new_content_stream)))

            new_ground_truths[i] = synthesizer.create_new_ground_truth()
            synthesizer.reset()

        # Second pass: splice each output's streams into the pages and save.
        for i in k_to_process:
            for page_number, page in enumerate(pdf.pages):
                page.Contents = new_contents[i][page_number]

            pdf.save(_out_path(i, '.pdf'))
            _out_path(i, '.json').write_text(
                json.dumps(new_ground_truths[i], indent=2))
def pdfToText(file, cid, catID, fileid, docType):
    """Extract text from an uploaded PDF and route it to the DB by document type.

    file    -- open binary file object (rewound with seek(0) here)
    cid     -- course id forwarded to restructForDB/parsedToDB
    catID   -- category id forwarded to restructForDB/parsedToDB
    fileid  -- file id forwarded to restructForDB/parsedToDB
    docType -- 'Syllabus', 'Lecture' or 'Assignment'; selects the parsing strategy
    """
    # Category keywords used to split syllabus text into sections.
    keyWordObj = keywords.objects.exclude(categoryKeyWords=None)

    keyWords = []
    for n in keyWordObj:
        keyWords.append(n.categoryKeyWords)
    extracted_text = ''

    # Rewind and wire up the (legacy pdfminer API) parser/document pair.
    file.seek(0)
    parser = PDFParser(file)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    # Layout-analysis parameters.
    laparams = LAParams()

    # Tight margins so words/lines inside the PDF are separated sensibly.
    laparams.char_margin = 1
    laparams.word_margin = 1

    # Aggregator yields layout objects (text boxes) per page.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    dataArray = []

    if docType == 'Syllabus':
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBoxHorizontal):
                    extracted_text = lt_obj.get_text()
                    filtText = restructString(extracted_text)
                    added = False

                    # Iterate over a snapshot: the original iterated the live
                    # list while calling keyWords.remove(), which silently
                    # skips the element following every removal.
                    for n in list(keyWords):
                        if (n.lower() in filtText.lower()) and not added and (
                                len(n) > 2):
                            # Keyword found: this text box starts a new section.
                            dataArray.append(filtText)
                            keyWords.remove(n)
                            added = True
                        elif (len(n) < 3) and (re.match(
                            ('[a-zA-Z]*' + re.escape(n) + '[a-zA-Z]*'),
                                filtText) is not None):
                            # Short keywords (<3 chars) need a case-sensitive
                            # in-word match to avoid spurious substring hits.
                            if (n in filtText) and not added:
                                dataArray.append(filtText)
                                keyWords.remove(n)
                                added = True
                        elif (n.lower() in filtText.lower()) and added:
                            # Block already captured under another keyword;
                            # consume this keyword so it is not reused.
                            keyWords.remove(n)

                    if (len(dataArray) > 0) and not added:
                        # Append continuation text to the current section,
                        # skipping empty lines and bare page numbers.
                        if (filtText != ' <br>') and (re.match(
                                '[0-9]* <br>',
                                filtText) is None) and (filtText != '<br>'):
                            dataArray[-1] += ' ' + filtText

                    elif len(dataArray) == 0:
                        # Text before the first keyword becomes the header section.
                        dataArray.append('Course Information: ' + filtText)
        for n in dataArray:
            restructForDB(n, cid, catID, fileid)

    elif docType == 'Lecture':
        # One accumulated entry per page.
        for page in doc.get_pages():
            dataArray.append('')
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBoxHorizontal):
                    extracted_text = lt_obj.get_text()
                    filtText = restructString(extracted_text)
                    # Skip empty lines and bare page numbers.
                    if (filtText != ' <br>') and (re.match(
                            '[0-9]* <br>',
                            filtText) is None) and (filtText != '<br>'):
                        dataArray[-1] += '' + filtText
        for n in dataArray:
            restructForDB(n, cid, catID, fileid)

    elif docType == 'Assignment':
        # Assignments are re-parsed with a TextConverter so questions can be
        # split on "<digit>." markers.
        file.seek(0)
        parser = PDFParser(file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        retstr = StringIO()

        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        data = ""
        head = ""
        for page in doc.get_pages():
            # Remember where this page's output starts so only the newly
            # appended text is read back from the shared buffer.
            read_position = retstr.tell()
            interpreter.process_page(page)
            retstr.seek(read_position, 0)
            page_text = retstr.read()
            page_text = re.sub('\\n', ' <br> ', page_text)
            # Replace private-use/ligature glyphs that break downstream parsing.
            page_text = re.sub('\\uf0b7|\\uf020|\\ufb01|\\uf8ff', '?',
                               page_text)

            for word in page_text.split():
                skip = False
                if len(word) < 4:
                    # A short token containing "<digit>." starts a new question.
                    for ind, c in enumerate(word):
                        if c.isdigit() and ind < len(word) - 1:
                            if word[ind + 1] == '.':
                                # Flush the previous question if it has content.
                                tempStr = data.replace('<br>', '').rstrip(' ')
                                if tempStr != "":
                                    data = head + ' ' + data
                                    parsedToDB(data, cid, catID, fileid)
                                head = 'Question ' + word
                                data = ""
                                skip = True
                                break
                if not skip:
                    if data == "":
                        data = word
                    else:
                        data = data + ' ' + word

        # Flush the final question.
        if data != "":
            data = head + ' ' + data
            parsedToDB(data, cid, catID, fileid)
        file.close()
示例#20
0
    def convert_pdf_to_txt(self, skiprows):
        """Download the PDF named in self.objectj, dump its tables to CSV via
        tabula's read_pdf, then optionally let the user rewrite the JSON
        options file interactively.

        skiprows -- int-convertible count of leading table rows to skip
        """
        try:
            self.url = str(self.objectj['object']['url'])
            self.list_columns = self.objectj['object']['columns']
            self.name_doc = self.objectj['object']['document_name']
            self.list_skiprows = self.objectj['object']['skiprows']
            print(self.url + "|\n URL\n")
            print(str(self.list_columns) + "|\n COLUMNS\n")
            print(self.name_doc + "\n Document Name\n")
            print(str(self.list_skiprows) + " \n Skiprows\n")
        except Exception as exobjason:
            # Mandatory options missing from the JSON object: abort the run.
            print("Json Object Error: " + str(exobjason))
            time.sleep(5)
            quit()
        pdfname = r'.\_download_' + str(self.name_doc) + ".pdf"
        filename = wget.download(self.url, out=pdfname)
        iskiprows = int(skiprows)
        # Extract every page's tables into one CSV-shaped frame.
        # fix: the file handle was never closed in the original.
        with open(filename, 'rb') as path_pdf:
            pdfr = read_pdf(
                path_pdf,
                #guess=False,
                pages='all',
                pandas_options={
                    'skiprows': iskiprows,
                    'header': None
                },
                output_format="csv")
        #headers = ['NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP', 'PHONE']
        #pdfr.columns = headers
        csvname = r'.\_generate_' + str(self.name_doc) + ".csv"
        pdfr.to_csv(csvname)
        print("Head Csv File Generate\n" + str(pdfr.head()))
        print("Change Options in Json Object? ")
        print("-y Change options -enter pass ")
        optdocop = input("	Change Json Document Options? >:")
        if optdocop == "-y":
            url = input("	url >:")
            # fix: str has no .content() method (original raised
            # AttributeError); the intent was a URL-scheme check.
            if url != "" and str(url).startswith(('http://', 'https://')):
                self.objectj['object']['url'] = str(url)
                print("Change Url" + str(self.objectj['object']['url']))
            columns = input("	columns (separate fields with ,) >:")
            lcolumns = str(columns).split(",")

            if columns != "":
                self.objectj['object']['columns'] = lcolumns
                print("Change columns" +
                      str(self.objectj['object']['columns']))

            document_name = input("	document_name >:")
            if document_name != "":
                self.objectj['object']['document_name'] = str(document_name)
                print("Change document_name" +
                      str(self.objectj['object']['document_name']))

            skiprows = input("	skiprows (separate fields with ,) >:")
            lskiprows = str(skiprows).split(",")
            if skiprows != "":
                self.objectj['object']['skiprows'] = lskiprows
                print("Change skiprows" +
                      str(self.objectj['object']['skiprows']))
            doctype = input("	doctype >:")
            if doctype != "":
                self.objectj['object']['doctype'] = str(doctype)
                print("Change doctype" +
                      str(self.objectj['object']['doctype']))
            try:
                # Keep a pristine copy before overwriting the options file.
                with open(self.doc, 'r') as filerin:
                    self.objectj_copy = json.load(filerin)
                with open(self.doc, 'w') as filewin:
                    json.dump(self.objectj, filewin, indent=5)
            except Exception as exchangejson:
                print("Error I-O Json file" + str(exchangejson))
                time.sleep(4)
            try:
                # Reload and echo the options actually persisted to disk.
                with open(self.doc, 'r') as filerin1:
                    self.objectj = json.load(filerin1)
                    print("NEW OBJECT\nURL: " +
                          str(self.objectj['object']['url']))
                    print("COLUMNS: " + str(self.objectj['object']['columns']))
                    print("DOCUMENT NAME: " +
                          str(self.objectj['object']['document_name']))
                    print("SKIPROWS: " +
                          str(self.objectj['object']['skiprows']))
                    print("DOCUMENT_TYPE: " +
                          str(self.objectj['object']['doctype']))
                    # NOTE(review): presumably re-initializes the instance from
                    # the new options; confirm __init__ takes no arguments.
                    self.__init__()
            except Exception as exchangejson:
                print("Error I-O Json file" + str(exchangejson))
# Count the total number of pages with PyPDF2.
filename = "north_korea_economic_growth.pdf"
filepath = os.path.join(os.getcwd(), "data", filename)
# fix: the original opened fp here and then rebound fp inside the loop,
# leaking this first handle; a with-block closes it deterministically.
with open(filepath, 'rb') as fp:
    total_pages = PyPDF2.PdfFileReader(fp).numPages
print(total_pages)

# Extract per-page text with pdfminer: page_text[page_no] -> str.
page_text = {}
for page_no in range(total_pages):

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    password = None
    maxpages = 0  # 0 means "no page limit"
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    caching = True
    pagenos = [page_no]  # restrict extraction to this single page

    with open(filepath, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)

    page_text[page_no] = retstr.getvalue()

    device.close()
    retstr.close()  # fix: buffer was never closed in the original
示例#22
0
文件: TEST.py 项目: hasiya/pdf_parse
def parsePDF(url):
    """Download the Iowa B100 price report PDF at *url* and return a tuple
    (report_date_string, average_price_float).

    Also publishes the extracted raw text through the module-level ``data``
    variable (legacy behavior kept for existing callers).
    """
    # fix: the original declared "global data, data" (duplicate name) and
    # bound the fetched bytes to the name ``open``, shadowing the builtin.
    global data
    raw_pdf = urllib2.urlopen(Request(url)).read()

    # Wrap the downloaded bytes in an in-memory file object.
    from StringIO import StringIO
    memory_file = StringIO(raw_pdf)

    # Create a PDF parser object associated with the StringIO object.
    parser = PDFParser(memory_file)

    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)

    # Define parameters for the PDF device object.
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    codec = 'utf-8'

    # Create a PDF device object and its interpreter.
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page; retstr accumulates, so read it once afterwards.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
    data = retstr.getvalue()

    # Price range like "3.05-3.25" immediately after the "---" separator.
    reg = r'(?<=\n---\n\n)\d.\d{2}-\d.\d{2}'
    matches = re.findall(reg, data)  # our range is in matches[0]

    # Split the range into its low and high endpoints.
    low = re.search(r'\d.\d{2}(?=-)', matches[0])
    high = re.search(r'(?<=-)\d.\d{2}', matches[0])

    # Cast string values to float and average them.
    low_val = float(low.group(0))
    high_val = float(high.group(0))
    ave = (high_val + low_val) / 2

    # Find the report date, e.g. "Mon, Jan 02, 2017".
    reg = r'\w{3},\s\w{3}\s\d{2},\s\d{4}'
    match = re.search(reg, data)
    dat = match.group(0)

    return (dat, ave)
示例#23
0
def readpdf(mypdf):
    """Set up a pdfminer text-extraction pipeline writing into an in-memory
    buffer.

    NOTE(review): *mypdf* is currently unused and nothing is returned; the
    snippet appears truncated at the source.
    """
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    # fix: the original passed `laparms` positionally AFTER keyword arguments
    # (a SyntaxError) and misspelled the parameter name.
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=laparams)
示例#24
0
from os import path
from io import StringIO  # fix: StringIO was used below but never imported
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf

input_file_path = "../../../../etc"

print("-" * 40)

input_file = path.join(input_file_path, "SampleDoc.pdf")
print("Input file name:", input_file)

# Build the pdfminer extraction pipeline writing into an in-memory buffer.
res_mgr = PDFResourceManager()
raw = StringIO()
laparams = LAParams()
device = TextConverter(res_mgr, raw, laparams=laparams)

# fix: use a with-block so the PDF handle is closed even if extraction fails.
with open(input_file, "rb") as pdf_file:
    process_pdf(res_mgr, device, pdf_file)
device.close()
content = raw.getvalue()
raw.close()

print("-" * 40)

print(content)

print("-" * 40)

if __name__ == '__main__':
    # fix: the original left this suite empty (SyntaxError); the script body
    # above already runs at import time, so there is nothing more to do.
    pass
示例#25
0
def main(argv):
    """Command-line driver: extract text from PDF files with pdfminer.

    Parses getopt-style flags, builds the requested converter device
    (text/html/xml/tag), and runs every input file through process_pdf.
    Returns 100 on usage errors, otherwise None.
    """
    def usage():
        # Print the flag summary; 100 is the conventional usage exit code.
        print((
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
            '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
            '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...'
            % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    debug = False
    # input option
    password = ''
    pagenos = set()  # 0-based page numbers collected from -p
    maxpages = 0  # 0 means "no page limit"
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1  # NOTE(review): assigned but never used below
    scale = 1
    caching = True
    showpageno = True  # NOTE(review): assigned but never used below
    laparams = LAParams()
    # NOTE(review): -n sets laparams to None, so a later -A/-V/-M/-L/-W/-F
    # flag would raise AttributeError; flag order matters here.
    for (k, v) in opts:
        if k == '-d': debug = True
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    # Infer the output type from the output file extension when -t is absent.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # close_outfp tracks whether we own the handle (never close sys.stdout).
    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    # Select the converter device matching the requested output type.
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               outdir=outdir,
                               debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    # Feed every input file through the shared device.
    for fname in args:
        fp = io.open(fname, 'rb')
        process_pdf(rsrcmgr,
                    device,
                    fp,
                    pagenos,
                    maxpages=maxpages,
                    password=password,
                    caching=caching,
                    check_extractable=True)
        fp.close()
    device.close()
    if close_outfp:
        outfp.close()
示例#26
0
def extract_text_from_pdf(pdf_path):
    """Extract all text from the PDF at *pdf_path* and print vendor-specific
    invoice fields (PO ref, date, vendor name, address, invoice no, GST, PAN,
    totals, tax, description) for each recognized vendor.

    The vendor is detected by searching for a marker string in the extracted
    text; every branch then pulls its fields out with hard-coded regexes.
    NOTE(review): each re.search below assumes its pattern is present — a
    non-matching layout makes .group() raise AttributeError on None.
    """
    # Build the pdfminer pipeline writing into an in-memory buffer.
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)
        text = fake_file_handle.getvalue()
    converter.close()
    fake_file_handle.close()
    print(text)
    print('\n\n')

    # --- Vendor: Paramount Trading Corporation ---------------------------
    if "Paramount Trading Corporation" in text:

        # PO reference sits between "PO" and "Invoice".
        PO = re.search("PO(.*?)Invoice", text)
        PO = PO.group()
        PO = PO.replace("PO Ref : ", " ")
        PO = PO.replace(" Invoice", " ")
        print(PO)

        # 8 characters following "Date :" (fixed-width date field).
        date = re.search("Date :.{8}", text)
        date = date.group()
        date = date.replace("Date :", " ")
        print(date)

        name = "Paramount Trading Corporation"
        print(name)

        add = re.search("Billing Address (.*?)Date", text)
        add = add.group().replace("Billing Address ", " ").replace("Date", " ")
        print(add)

        inv = re.search("Invoice No(.*?)%", text)
        inv = inv.group().replace("Invoice No:- ", " ").replace("%", " ")
        print(inv)

        #cpan = re.search("Customer PAN (.*?)Ship",text)
        #cpan = cpan.group().replace("Customer PAN No"," ").replace("Ship"," ")
        #print(cpan)

        #cgst = re.search("Customer GST (.*?)Customer", text)
        #cgst = cgst.group().replace("Customer GST No"," ").replace("Customer"," ")
        #print(cgst)

        gst = re.search("GST No : (.*?)PAN", text)
        gst = gst.group().replace("GST No : ", " ").replace("PAN", " ")
        gst = gst.replace("Paramount Trading Corporation  ", " ")
        print(gst)

        pan = re.search("PAN No : (.*?)Declaration", text)
        pan = pan.group().replace("PAN No : ", " ").replace("Declaration", " ")
        print(pan)

        # Total and tax are dug out of a 300-char window after "18%",
        # split on decimal points (position-dependent extraction).
        total = re.search("18%.{300}", text)
        total = total.group().split(".")
        total = total[1][2:] + "." + total[2][:2]
        print(total)

        tax = re.search("18%.{300}", text)
        tax = tax.group().split(".")
        tax[0] = tax[0].replace("18%", " ")
        tax = tax[0] + "." + tax[1][:2]
        print(tax)

        # Description: strip boilerplate strings one by one from the span.
        des = re.search("Paramount Trading Corporation(.*?)#8", text)
        des = des.group()
        des = des.replace("Description", " ")
        des = des.replace("#8", " ")
        des = des.replace("Commercial Invoice", " ")
        des = des.replace("Shipping Method", " ")
        des = des.replace("Mode of Payment", " ")
        des = des.replace("Shipment Date", " ")
        des = des.replace("Hero MotoCorp Ltd.C/o", " ")
        des = des.replace(
            "The Grand New Delhi, Nelson Mandel Road, Vasant Kunj, Phase  IINew Delhi, India. Pin - 110070",
            " ")
        des = des.replace("Contact : Avinash +919557971063", " ")
        des = des.replace("Total", " ")
        des = des.replace("Paramount Trading Corporation", " ")
        des = des.replace("Road", " ")
        des = des.replace("11th June 2019", " ")
        des = des.replace("Hero MotoCorp Ltd.", " ")
        des = des.replace("Customer PO Ref : ", " ")
        des = des.replace(PO, " ")
        des = des.replace("Invoice No:- ", " ")
        des = des.replace("GST No : ", " ")
        des = des.replace(gst, " ")
        des = des.replace("PAN No : ", " ")
        des = des.replace("%", " ")
        des = des.replace(
            "Declaration:We declare that this invoice shows the actual price of the goodsdescribed and that all particulars are true and correct.",
            " ")
        des = des.replace("Authorised Signatory", " ")
        des = des.replace("advance balance 60  ", " ")
        des = des.replace("against delivery", " ")
        des = des.replace(inv, " ")
        des = des.replace(pan, " ")
        des = des.replace("(round off)", " ")
        print(des)

    # --- Vendor: SONATA INFORMATION TECHNOLOGY LIMITED -------------------
    elif "SONATA" in text:

        PO = re.search("Cust PO Ref & Date(.*?)/", text)
        PO = PO.group().replace("Cust PO Ref & Date: ", " ").replace("/", " ")
        print(PO)

        date = re.search("Invoice Date: (.*?)BILL", text)
        date = date.group().replace("Invoice Date: ", " ").replace("BILL", " ")
        print(date)

        name = "SONATA INFORMATION TECHNOLOGY LIMITED"
        print(name)

        add = re.search("INVOICESONATA INFORMATION TECHNOLOGY LIMITED(.*?)TEL",
                        text)
        add = add.group().replace(
            "INVOICESONATA INFORMATION TECHNOLOGY LIMITED",
            " ").replace("TEL", " ")
        print(add)

        inv = re.search("Invoice No.:(.*?)Invoice", text)
        inv = inv.group().replace("Invoice No.:", " ").replace("Invoice", " ")
        print(inv)

        gst = re.search("GSTIN : (.*?)PAN", text)
        gst = gst.group().replace("GSTIN : ", " ").replace("PAN", " ")
        print(gst)

        pan = re.search("Our PAN is (.*?)and", text)
        pan = pan.group().replace("Our PAN is ", " ").replace("and", " ")
        print(pan)

        # Keep two decimal digits by rejoining around the split point.
        total = re.search("Total Invoice Value  (.*?)of", text)
        total = total.group().split(".")
        total[0] = total[0].replace("Total Invoice Value  ", " ")
        total = total[0] + "." + total[1][:2]
        print(total)

        tax = re.search("Total Tax Value(.*?)Total", text)
        tax = tax.group().replace("Total Tax Value", " ").replace("Total", " ")
        print(tax)

        des = re.search("Description of Goods/Services(.*?)Each", text)
        des = des.group()
        des = des.replace("Description of Goods/Services", " ")
        des = des.replace("Each", " ")
        des = des.replace("Qty", " ")
        des = des.replace("UOM", " ")
        des = des.replace("Rate", " ")
        des = des.replace("(INR)", " ")
        des = des.replace("Amount", " ")
        print(des)

    # --- Vendor: Concoct Human Resources Practitioners India -------------
    elif "Concoct Human Resources Practitioners India" in text:

        # PO and date live at fixed word offsets in the eWay-Bill window.
        PO = re.search("eWay Bill No#.{300}", text)
        PO = PO.group().split(" ")
        PO = PO[13]
        print(PO)

        date = re.search("eWay Bill No#.{300}", text)
        date = date.group().split(" ")
        date = date[12]
        print(date)

        name = "Concoct Human Resources Practitioners India"
        print(name)

        add = re.search("#(.*?)Proforma", text)
        add = add.group().replace("Proforma", " ")
        print(add)

        inv = re.search("Invoice No: (.*?)PAN", text)
        inv = inv.group().replace("Invoice No: ", " ").replace("PAN", " ")
        print(inv)

        gst = re.search("IGST No#:(.*?)IEC", text)
        gst = gst.group().replace("IGST No#:", " ").replace("IEC", " ")
        print(gst)

        pan = re.search("PAN No: (.*?)GSTIN", text)
        pan = pan.group().replace("PAN No: ", " ").replace("GSTIN", " ")
        print(pan)

        total = re.search("Total Inc. of GST @ 18%(.*?)Amount", text)
        total = total.group().replace("Total Inc. of GST @ 18%",
                                      " ").replace("Amount", " ")
        print(total)

        tax = "Not given separately"
        print(tax)

        des = re.search("Particulars(.*?)Total", text)
        des = des.group()
        des = des.replace("Particulars", " ")
        des = des.replace("Product", " ")
        des = des.replace("S/N", " ")
        des = des.replace("No# of Units", " ")
        des = des.replace("Price Per Unit", " ")
        des = des.replace("GST @ 18%", " ")
        des = des.replace("Amount", " ")
        des = des.replace("(INR)", " ")
        des = des.split(".")
        #des = re.findall("[a-z]",des)
        # Print the text preceding each "Unit" marker as a description line.
        l = len(des)
        for i in range(0, l - 1):
            if "Unit" in des[i]:
                desi = des[i].split("Unit")
                desi = desi[0]
                print(desi)

    # --- Vendor: MicroGenesis CADSoft ------------------------------------
    elif "MicroGenesis CADSoft" in text:

        PO = "Not given"
        print(PO)

        date = re.search("Despatched throughDated(.*?)Mode", text)
        date = date.group().replace("Despatched throughDated",
                                    " ").replace("Mode", " ")
        print(date)

        name = "MicroGenesis CADSoft"
        print(name)

        add = re.search("MicroGenesis CADSoft(.*?)MSMED", text)
        add = add.group().replace("MSMED",
                                  " ").replace("MicroGenesis CADSoft Pvt Ltd",
                                               " ")
        print(add)

        inv = re.search("Invoice No.(.*?)Delivery", text)
        inv = inv.group().replace("Invoice No.", " ").replace("Delivery", " ")
        print(inv)

        gst = re.search("GSTIN/UIN:(.*?)State", text)
        gst = gst.group().replace("GSTIN/UIN:", " ").replace("State", " ")
        print(gst)

        pan = re.search("Company's PAN :(.*?)Dec", text)
        pan = pan.group().replace("Company's PAN :", " ").replace("Dec", " ")
        print(pan)

        total = re.search("Total₹(.*?)No", text)
        total = total.group().replace("Total", " ").replace("No", " ")
        print(total)

        tax = re.search("IGST @ 18%(.*?)%", text)
        tax = tax.group().replace("IGST @ 18%", " ").replace("%", " ")
        print(tax)

        des = re.search("SACNo.Services(.*?)No", text)
        des = des.group().replace("SACNo.Services", " ").replace("No", " ")
        print(des)
示例#27
0
def main(argv):
    """CLI driver: convert PDFs to text/html/xml/tag with pdfminer.

    Parses getopt-style flags, builds the matching converter device, and
    runs every input file through a PDFPageInterpreter, applying the
    requested page rotation. Returns 100 on usage errors, otherwise None.
    """
    import getopt

    def usage():
        # Print the flag summary; 100 is the conventional usage exit code.
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()  # 0-based page numbers collected from -p
    maxpages = 0  # 0 means "no page limit"
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # NOTE(review): -n sets laparams to None, so a later -A/-V/-M/-L/-W/-F
    # flag would raise AttributeError; flag order matters here.
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-S': stripcontrol = True
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    # Propagate the accumulated debug level into the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # Infer the output type from the output file extension when -t is absent.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        # fix: the Python 2 builtin file() does not exist in Python 3.
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    # Select the converter device matching the requested output type.
    if outtype == 'text':
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter,
                               debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in args:
        # fix: file() -> open() for Python 3 compatibility.
        fp = open(fname, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            # Apply the -R rotation to every page before rendering.
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        fp.close()
    device.close()
    # fix: only close handles we opened; never close sys.stdout.
    if outfile:
        outfp.close()
    return
示例#28
0
def split(path):
    """Split a combined student-report PDF into one file per page.

    Pass 1 extracts each page's student number and name with pdfminer;
    pass 2 writes every page out as a separate single-page PDF named by
    a fresh UUID into the module-level ``reportdir`` directory.

    :param path: filesystem path to the combined report PDF.
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    stu_numbers = []
    stu_names = []
    indices = []
    # DB handle kept for the (currently commented-out) hash-key update below.
    dsn_tns = cx_Oracle.makedsn('connection', 'database ssid')
    db = cx_Oracle.connect(user='******', password='******', dsn=dsn_tns)
    cursor = db.cursor()  # assign db operation to cursor variable
    sql = '''UPDATE u_studentsuserfields u set u.ausd_hashkey = :hashid where u.studentsdcid = (SELECT s.dcid from
    students s where s.dcid = u.studentsdcid and s.student_number = :stu_numid) '''

    # Text markers bracketing the fields of interest on each page.
    anum = "Student ID:"  # begin text search param
    bnum = "Parent"  # end text search param
    aname = "Report for"
    bname = "School"
    count = 0

    # --- Pass 1: PDF text extraction (student number + name per page) ---
    with open(path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            indices.append(count)
            page_interpreter.process_page(page)
            count += 1
            # getvalue() accumulates text across all pages processed so
            # far, so each field is read from the *last* marker occurrence
            # (the page just processed).
            text = fake_file_handle.getvalue()

            # Number processing
            stu_num = str(text.split(anum)[-1].split(bnum)[0])
            stu_num = stu_num.strip().replace(" ", "")
            stu_numbers.append(stu_num)

            # Name processing: strip NUL bytes (and a literal "/x00"
            # artifact) plus the trailing "Student" label.
            stu_name = str(text.split(aname)[-1].split(bname)[0])
            stu_name = stu_name.strip()
            stu_name = stu_name.replace("/x00", "")
            stu_name = stu_name.replace("\x00", "")
            stu_name = stu_name.replace("Student", "")
            stu_names.append(stu_name)

            print(stu_num + " " + stu_name + "'s report has been parsed")

    converter.close()
    fake_file_handle.close()

    # --- Pass 2: write each page out as its own PDF ---
    # BUG FIX: the original nested the while loop inside a redundant
    # ``for page in range(getNumPages())`` — the first outer iteration
    # already exhausted the while, leaving the outer loop spinning
    # uselessly.  The outer loop is removed and the source file is now
    # closed via a context manager instead of being leaked.
    with open(path, 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file_obj)
        num_pages = pdf_reader.getNumPages()
        count2 = 0
        while count2 < num_pages:  # read each page in order
            pageobj = pdf_reader.getPage(count2)
            cur_stunum = stu_numbers[count2]
            pdf_writer = PyPDF2.PdfFileWriter()
            pdf_writer.addPage(pageobj)
            ausdhash = str(uuid.uuid4())
            # params = {'hashid': ausdhash, 'stu_numid': cur_stunum}
            # cursor.execute(sql, params)
            output_filename = '{}.pdf'.format(ausdhash)
            outdir = os.path.join(reportdir, output_filename)
            count2 += 1
            with open(outdir, "wb") as out:
                pdf_writer.write(out)
            print("generated " + cur_stunum + " " + output_filename)
示例#29
0
文件: v1.py 项目: charlesyuyue/AFP
def extractAccountingPolicySection(path):
    """Extract the "significant accounting policies" section from a PDF.

    Scans pages 40 onward for the section titled with both "会计政策"
    (accounting policy) and "会计估计" (accounting estimates), collects
    line text from the subsection starting at "遵循企业会计", and stops
    at the taxation section ("税项").

    :param path: filesystem path to the PDF report.
    :returns: the concatenated section text ('' if never found).
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    extracted = ""
    foundTitle = 0   # 1 once a candidate section title has been seen
    ifInSection = 0  # 1 while inside the target subsection

    pageCount = 1
    # BUG FIX: the original leaked fp/device/retstr on the early
    # ``return extracted`` below; cleanup now runs in ``finally``.
    try:
        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            # The section never appears before page 40, so skip earlier
            # pages entirely.
            if pageCount >= 40:
                interpreter.process_page(page)

                # Get all text in the current page
                data = retstr.getvalue()

                for line in data.splitlines():
                    # if in the section of accounting disclosure
                    if ifInSection == 1:
                        if line.strip():
                            # if reach section of taxation
                            if "税项" in line:
                                return extracted
                            extracted += line
                    else:
                        if line.strip():
                            # if keywords "accounting policy" present
                            if "会计政策" in line and "会计估计" in line:
                                # if it is not in the target section, clear everything
                                if foundTitle == 1 and ifInSection == 0:
                                    extracted = ""
                                    foundTitle = 0
                                foundTitle = 1
                                extracted += line
                            # if found the subtitle of the target section
                            elif "遵循企业会计" in line:
                                ifInSection = 1
                                extracted += line
                # Reset the accumulating buffer so the next page's text
                # is read in isolation.
                data = ''
                retstr.truncate(0)
                retstr.seek(0)
            pageCount = pageCount + 1
    finally:
        fp.close()
        device.close()
        retstr.close()
    return extracted
示例#30
0
    def __init__(self, pdf_stream, password='', pagenos=[], maxpages=0):
        """Parse a PDF stream: collect metadata, full text, and references.

        :param pdf_stream: binary file-like object containing the PDF.
        :param password: optional document password.
        :param pagenos: page numbers to process (empty list = all pages).
            NOTE(review): mutable default argument — shared across calls;
            safe only while it is never mutated here.
        :param maxpages: maximum number of pages to process (0 = no limit).
        """
        ReaderBackend.__init__(self)
        self.pdf_stream = pdf_stream

        # Extract Metadata from the document info dictionary.
        parser = PDFParser(pdf_stream)
        doc = PDFDocument(parser, password=password, caching=True)
        if doc.info:
            for k in doc.info[0]:
                v = doc.info[0][k]
                # print(repr(v), type(v))
                # `unicode` here implies a py2/py3 compat shim is in scope
                # elsewhere in the file — TODO confirm.
                if isinstance(v, (bytes, str, unicode)):
                    self.metadata[k] = make_compat_str(v)
                elif isinstance(v, (psparser.PSLiteral, psparser.PSKeyword)):
                    self.metadata[k] = make_compat_str(v.name)

        # Secret Metadata: XMP stream in the document catalog, merged on
        # top of the info-dictionary metadata above.
        if 'Metadata' in doc.catalog:
            metadata = resolve1(doc.catalog['Metadata']).get_data()
            # print(metadata)  # The raw XMP metadata
            # print(xmp_to_dict(metadata))
            self.metadata.update(xmp_to_dict(metadata))
            # print("---")

        # Extract Content: render every page's text into one BytesIO,
        # decoded to str once at the end.
        text_io = BytesIO()
        rsrcmgr = PDFResourceManager(caching=True)
        converter = TextConverter(rsrcmgr,
                                  text_io,
                                  codec="utf-8",
                                  laparams=LAParams(),
                                  imagewriter=None)
        interpreter = PDFPageInterpreter(rsrcmgr, converter)

        self.metadata["Pages"] = 0
        self.curpage = 0
        for page in PDFPage.get_pages(self.pdf_stream,
                                      pagenos=pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=True,
                                      check_extractable=False):
            # Read page contents
            interpreter.process_page(page)
            self.metadata["Pages"] += 1
            self.curpage += 1

            # Collect URL annotations (links embedded as PDF annotations,
            # as opposed to URLs found in the text below).
            # try:
            if page.annots:
                refs = self.resolve_PDFObjRef(page.annots)
                if refs:
                    if isinstance(refs, list):
                        for ref in refs:
                            if ref:
                                self.references.add(ref)
                    elif isinstance(refs, Reference):
                        self.references.add(refs)

            # except Exception as e:
            # logger.warning(str(e))

        # Remove empty metadata entries
        self.metadata_cleanup()

        # Get text from stream
        self.text = text_io.getvalue().decode("utf-8")
        text_io.close()
        converter.close()
        # print(self.text)

        # Extract URL references from text.
        # NOTE(review): each reference is tagged with self.curpage, i.e.
        # the *last* processed page, not the page it was found on.
        for url in extractor.extract_urls(self.text):
            self.references.add(Reference(url, self.curpage))

        for ref in extractor.extract_arxiv(self.text):
            self.references.add(Reference(ref, self.curpage))

        for ref in extractor.extract_doi(self.text):
            self.references.add(Reference(ref, self.curpage))