def create_individual_pdfs(revue):
    """Build one merged PDF per actor of the revue.

    Front pages are typeset only for actors whose ``forside-<name>.pdf`` is
    missing from the PDF cache directory.  Every actor then gets a merge job
    consisting of the front page, the act overview, the role list, the actor
    object itself (presumably expanded by the merger into that actor's
    material -- confirm against ``PDF.parallel_pdfmerge``) and the props
    list, written to ``<individual pdf>/<actor name>.pdf``.
    """
    path = revue.conf["Paths"]

    def front_page_name(actor):
        return "forside-{}.pdf".format(actor.name)

    # Collect [TeX, file name] pairs for the front pages that are not cached.
    pending = []
    for actor in revue.actors:
        name = front_page_name(actor)
        if os.path.isfile(os.path.join(path["pdf cache"], name)):
            continue
        tex = TeX(revue)
        tex.create_frontpage(subtitle=actor.name)
        pending.append([tex, name])

    # Typeset the missing front pages into the cache.
    conv = cv.Converter()
    conv.parallel_textopdf(pending, outputdir=path["pdf cache"])

    # One (sources, target) job per actor.
    merge_jobs = []
    for actor in revue.actors:
        sources = (
            os.path.join(path["pdf cache"], front_page_name(actor)),
            os.path.join(path["pdf"], "aktoversigt.pdf"),
            os.path.join(path["pdf"], "rolleliste.pdf"),
            actor,
            os.path.join(path["pdf"], "rekvisitliste.pdf"),
        )
        target = os.path.join(path["individual pdf"],
                              "{}.pdf".format(actor.name))
        merge_jobs.append((sources, target))

    merger = PDF()
    merger.parallel_pdfmerge(merge_jobs)
def create_song_manus_pdf(revue):
    """Typeset a front page and merge every song PDF into ``sangmanuskript.pdf``.

    The front page is always regenerated into ``<pdf>/cache``; an earlier
    existence check was disabled with the note that this now takes care of
    itself.  Songs are the materials whose category equals the configured
    ``songs`` path; each one is passed to the merger as a
    ``(pdf path, title)`` pair.
    """
    path = revue.conf["Paths"]

    front = TeX(revue)
    front.create_frontpage(subtitle="sangmanuskript")
    front.topdf("forside-sangmanuskript.pdf",
                outputdir=os.path.join(path["pdf"], "cache"))

    # The front page comes first, followed by the songs in act order.
    merge_list = [os.path.join(path["pdf"], "cache",
                               "forside-sangmanuskript.pdf")]
    songs = (
        material
        for act in revue.acts
        for material in act.materials
        if material.category == path["songs"]
    )
    for song in songs:
        song_pdf = os.path.join(
            path["pdf"],
            os.path.dirname(os.path.relpath(song.path)),
            # Drops the last four characters of the file name
            # (presumably the ".tex" extension -- confirm).
            "{}.pdf".format(song.file_name[:-4]),
        )
        merge_list.append((song_pdf, song.title))

    PDF().pdfmerge(merge_list, os.path.join(path["pdf"], "sangmanuskript.pdf"))
class TextToPDF:
    """Feed text files into a ``PDF`` object, collecting errors instead of raising.

    Failures are appended to ``self.errors`` as human-readable strings.
    """

    def __init__(self, font='JetBrainsMono'):
        """Create the underlying PDF with *font* and an empty error list."""
        self.pdf, self.errors = PDF(font), []

    def input_from_file(self, path):
        """Add the file at *path* as a chapter; record a message if it cannot be read."""
        try:
            self.pdf.print_chapter(path)
        except (FileNotFoundError, PermissionError):
            self.errors.append(f"{path} file does not exist")

    def input_from_files(self, paths):
        """Add every file in *paths*, in order."""
        for entry in paths:
            self.input_from_file(entry)

    def input_from_package(self, path, extension=".java"):
        """Add every entry of the directory *path*.

        ``extension`` is currently not applied: the filter that used it is
        disabled, so all directory entries are passed on.
        """
        try:
            entries = [path + "/" + entry for entry in os.listdir(path)]
            self.input_from_files(entries)
        except FileNotFoundError:
            self.errors.append(f"{path} folder does not exist")

    def output(self, path):
        """Write the PDF to *path*; record a message if the location is missing."""
        try:
            self.pdf.output(path, 'F')
        except FileNotFoundError:
            self.errors.append(f"{path} folder does not exist")
def __init__(self, settings):
    """Create the format parsers and one empty result list per file type.

    ``settings`` is accepted but not used in this constructor.
    """
    self.context = None
    self.totalCount = 0

    # One parser object per container format.
    self.cfbf = CFBF()
    self.pdf = PDF()
    self.ooxml = OOXML()

    # Independent accumulators, one per handled extension / PDF parser.
    for bucket in ("xls_result", "ppt_result", "doc_result",
                   "xlsx_result", "pptx_result", "docx_result",
                   "PyPDF_result", "PDFNoModule_result"):
        setattr(self, bucket, [])
def OnPrintPage( self, page ):
    """Render one printout page into the current PDF document.

    A PDF is created lazily on the first page.  Unless ``allInOne`` is set,
    the document is written to ``<dir>/<fileBase>-<category>.pdf`` and
    discarded once the last page of the category has been drawn.
    Always returns True.
    """
    def latin1( text ):
        # Metadata is stored as ISO-8859-1; unencodable characters are dropped.
        return unicode(text).encode('iso-8859-1','ignore')

    exportGrid = self.prepareGrid( page )

    info = self.pageInfo[page]
    category, pageNumber, pageTotal = info[0], info[3], info[4]

    fname = u'{fileBase}-{categoryName}.pdf'.format(
        fileBase = self.fileBase,
        categoryName = category.fullname if category != 'Primes' else 'Primes',
    )
    fname = Utils.RemoveDisallowedFilenameChars( fname ).replace( ' ', '-' )

    if not self.pdf:
        landscape = (self.orientation == wx.LANDSCAPE)
        self.pdf = PDF( orientation = 'L' if landscape else 'P' )
        self.pdf.set_font( 'Arial', '', 12 )
        self.pdf.set_author( latin1(getpass.getuser()) )
        self.pdf.set_keywords( latin1('CrossMgr Results') )
        self.pdf.set_creator( latin1(Version.AppVerName) )
        self.pdf.set_title( latin1(os.path.splitext(fname)[0].replace('-', ' ')) )

    # The middle entries of the page info are passed through as draw arguments.
    exportGrid.drawToFitPDF( *([self.pdf, self.orientation] + info[1:-1]) )

    if self.allInOne or not pageNumber == pageTotal:
        return True

    # Last page of this category: write the file and start afresh.
    if self.dir and not os.path.isdir( self.dir ):
        os.mkdir( self.dir )
    fname = os.path.join( self.dir, fname )
    self.pdf.output( fname, 'F' )
    self.lastFName = fname
    self.pdf = None
    return True
def test():
    """Run audit-fee table extraction over the past year of HKEX filings.

    For every filing returned by ``HKEX_API`` the linked PDF is parsed with
    ``AuditFee`` and one row is written to ``result_3.csv``: the non-empty
    table summaries on success, or the raised exception with ``ERROR`` set
    to True on failure.  A ``KeyboardInterrupt`` stops the run.
    """
    # logging.basicConfig(level=logging.INFO)
    query = HKEX_API(from_date=n_yearsago(n=1), to_date=today())
    for data in query.data:
        # Reset for every filing: if reading the link itself fails, the error
        # row must not carry the previous filing's URL (or hit an unbound
        # name on the very first filing).
        url = None
        try:
            url = data.file_link
            print(url)
            pdf_obj = PDF(url).pdf_obj
            f = AuditFee(pdf_obj)
            tab_sum = [table.summary for table in f.tables]
        except KeyboardInterrupt:
            break
        except Exception as e:
            result = {
                'table_summary': e,
                'ERROR': True,
                'url': url,
            }
        else:
            result = {
                'table_summary': list(filter(None, tab_sum)),
                'ERROR': None,
                'url': url,
            }
        write_to_csv(result, 'result_3.csv')
def create_individual_pdfs(revue):
    """Build one merged, labelled PDF per actor of the revue.

    A front page is typeset for every actor unconditionally; the earlier
    "only if not cached" check was retired with the note that this should
    now sort itself out.  Each merge job lists ``(path, label)`` pairs for
    the front page, act overview, role list and contact list, with the
    actor object itself in between (presumably expanded by the merger into
    that actor's material -- confirm against ``PDF.parallel_pdfmerge``).
    """
    path = revue.conf["Paths"]

    def tex_for_front_page(name):
        tex = TeX(revue)
        tex.create_frontpage(subtitle=name)
        return tex

    # [TeX, output file name] pairs, one per actor.
    frontpages_list = []
    for actor in revue.actors:
        frontpages_list.append([
            tex_for_front_page(actor.name),
            "forside-{}.pdf".format(actor.name),
        ])

    # Typeset the front pages into the cache directory.
    conv = cv.Converter()
    conv.parallel_textopdf(frontpages_list, outputdir=path["pdf cache"])

    merge_jobs = []
    for actor in revue.actors:
        front_page = os.path.join(path["pdf cache"],
                                  "forside-{}.pdf".format(actor.name))
        sources = (
            (front_page, "Forside"),
            (os.path.join(path["pdf"], "aktoversigt.pdf"), "Aktoversigt"),
            (os.path.join(path["pdf"], "rolleliste.pdf"), "Rolleliste"),
            actor,
            (os.path.join(path["pdf"], "kontaktliste.pdf"), "Kontaktliste"),
        )
        target = os.path.join(path["individual pdf"],
                              "{}.pdf".format(actor.name))
        merge_jobs.append((sources, target))

    merger = PDF()
    merger.parallel_pdfmerge(merge_jobs)
def OnPrintPage(self, page):
    """Render one printout page into the current PDF document.

    Returns True immediately when no page info can be read for *page*
    (the no-data case).  Otherwise a PDF is created lazily, the page is
    drawn, and -- unless ``allInOne`` is set -- the document is written to
    ``<dir>/<fileBase>-<category>.pdf`` after the category's last page.
    Always returns True.
    """
    def latin1(text):
        # Metadata is stored as ISO-8859-1; unencodable characters are dropped.
        return unicode(text).encode('iso-8859-1', 'ignore')

    exportGrid = self.prepareGrid(page)

    try:
        category = self.pageInfo[page][0]
    except Exception as e:
        # Handle case of no data.
        return True

    info = self.pageInfo[page]
    pageNumber, pageTotal = info[3], info[4]

    fname = u'{fileBase}-{categoryName}.pdf'.format(
        fileBase=self.fileBase,
        categoryName=category.fullname if category != 'Primes' else 'Primes',
    )
    fname = Utils.RemoveDisallowedFilenameChars(fname).replace(' ', '-')

    if not self.pdf:
        landscape = (self.orientation == wx.LANDSCAPE)
        self.pdf = PDF(orientation='L' if landscape else 'P')
        self.pdf.set_font('Arial', '', 12)
        self.pdf.set_author(latin1(getpass.getuser()))
        self.pdf.set_keywords(latin1('CrossMgr Results'))
        self.pdf.set_creator(latin1(Version.AppVerName))
        self.pdf.set_title(latin1(os.path.splitext(fname)[0].replace('-', ' ')))

    # The middle entries of the page info are passed through as draw arguments.
    exportGrid.drawToFitPDF(*([self.pdf, self.orientation] + info[1:-1]))

    if self.allInOne or not pageNumber == pageTotal:
        return True

    # Last page of this category: write the file and start afresh.
    if self.dir and not os.path.isdir(self.dir):
        os.mkdir(self.dir)
    fname = os.path.join(self.dir, fname)
    self.pdf.output(fname, 'F')
    self.lastFName = fname
    self.pdf = None
    return True
def create_song_manus_pdf(revue):
    """Merge the song front page and every song PDF into ``sangmanuskript.pdf``.

    The ``<pdf>/cache`` directory and the cached front page are created only
    when they do not exist yet.  Songs are the materials whose category
    equals the configured ``songs`` path; their PDFs are looked up under
    ``<pdf>/<songs>``.
    """
    path = revue.conf["Paths"]
    cache_dir = os.path.join(path["pdf"], "cache")
    front_page = os.path.join(cache_dir, "forside-sangmanuskript.pdf")

    # Create the front page, if it doesn't already exist.
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    if not os.path.isfile(front_page):
        tex = TeX(revue)
        tex.create_frontpage(subtitle="sangmanuskript")
        tex.topdf("forside-sangmanuskript.pdf",
                  outputdir=os.path.join(path["pdf"], "cache"))

    # Front page first, then the songs in act order.
    merge_list = [front_page]
    for act in revue.acts:
        merge_list.extend(
            os.path.join(path["pdf"], path["songs"],
                         # Drops the last four characters of the file name
                         # (presumably the ".tex" extension -- confirm).
                         "{}.pdf".format(material.file_name[:-4]))
            for material in act.materials
            if material.category == path["songs"]
        )

    PDF().pdfmerge(merge_list, os.path.join(path["pdf"], "sangmanuskript.pdf"))
def main(config=None, label=None):
    """Generate a single label, or start the label server.

    With *label* (a mapping of keyword arguments for ``PDF.generate``) the
    fonts are registered, the label is generated and the function returns.
    Otherwise, if *config* is given, a ``Server`` is started with the
    prepared PDF generator.
    """
    # Set up the PDF generator; there is always one initial page.
    generator = PDF("A4")
    generator.add_page()

    if not label:
        if config:
            Server(config, generator).run()
        return

    # Console run with label input: register all fonts, generate, and exit.
    for style, font_file in (("", "fonts/cambria.ttf"),
                             ("B", "fonts/cambria_B.ttf")):
        generator.add_font("cambria", style, font_file, True)
    generator.generate(**label)
def post(self):
    """Build a PDF from the posted ``title``, ``circles`` and ``lines`` and reply with its file name.

    Missing arguments default to ``'This is the title'``, 10 and 10.
    """
    title = self.get_argument('title', 'This is the title')
    circles, lines = (
        int(self.get_argument(field, 10)) for field in ('circles', 'lines')
    )

    document = PDF()
    document.start(title, circles, lines)
    document.set_title(title)
    self.write(document.save())
def print_aso_bib(participant, copies=2):
    """Return a PDF string holding *copies* ASO-format bibs for *participant*.

    *copies* is coerced with ``int``; each copy is drawn by ``aso_bib``.
    """
    holder = participant.license_holder
    count = int(copies)

    document = PDF(orientation='P')
    document.set_subject('Bib number and rider info in aso format.')
    document.set_keywords(
        'RaceDB CrossMgr Bicycle Racing Software Database Road Time Trial MTB CycloCross RFID'
    )

    for _ in xrange(count):
        aso_bib(
            document,
            participant.bib,
            holder.first_name,
            holder.last_name,
            participant.competition.name,
        )
    return document.output(dest='s')
def print_aso_bib_two_per_page(participant):
    """Return a PDF string with the participant's bib in the two-per-page ASO layout."""
    document = PDF(orientation='P')
    document.set_subject(
        'Bib number and rider info in modified aso format, two per page.')
    document.set_keywords(
        'RaceDB CrossMgr Bicycle Racing Software Database Road Time Trial MTB CycloCross RFID'
    )

    holder = participant.license_holder
    aso_bib_two_per_page(
        document,
        participant.bib,
        holder.first_name,
        holder.last_name,
        participant.competition.name,
    )
    return document.output(dest='s')
def debug(url, p):
    """Print the audit-fee section text, parsed table and summary for one PDF.

    *url* locates the PDF; *p* is handed to ``AuditFee.target_section`` to
    select the section.
    """
    audit_fee = AuditFee(PDF(url).pdf_obj)
    section = audit_fee.target_section(p)
    print(section.extract_text())

    table = AuditFeeTable(section)
    # Further attributes that can be printed while debugging:
    # currency_idx, amount_idx, year_idx, co_row_idx, co_col_idx, raw_table.
    for view in (table.table, table.summary):
        print(view)
def process_pdf(input_path, db_feed):
    '''
    Kick off PDF processing.

    input_path - path to the pdf
    db_feed    - pdf json from the db, used to initialise the PDF parameters

    Returns the result of ``process_df`` when a dataframe could be built;
    otherwise prints a notice and returns None.
    '''
    # Parameter object (page/table patterns and column) built from the db feed.
    pdf_object = PDF(db_feed)

    # Parse the file, then build the dataframe with a fresh parser instance.
    pdf_content = PDFparser().parse(input_path)
    df = PDFparser().create_df(
        pdf_content,
        pdf_object.page_pattern,
        pdf_object.table_pattern,
        pdf_object.column,
    )

    # Transformation and filtering only continue when parsing succeeded.
    if df is None:
        print('no dataframe passed')
        return None
    return process_df(pdf_object, df)
def main():
    """Run the analysis selected on the command line and write ``analytic.pdf``.

    The first option that is set wins, in the order clear, cities, states,
    type.  With none set an error message is printed.  In every case the
    PDF is written afterwards below the module-level ``url``.
    """
    options = parse()

    report = PDF()
    report.input_title()

    if options.clear:
        data_clear(report)
    elif options.cities:
        compare_city(options.cities, report)
    elif options.states:
        compare_state(options.states, report)
    elif options.type:
        compare_restaurant(report, options.type)
    else:
        print(
            'Error: se requiere uno o mas argumentos para realizar la accion. Pulsa -h para más información'
        )

    report.output(url + "/../pdf/" + "analytic.pdf", 'F')
def print_bib_tag_label(participant,
                        sponsor_name=None,
                        left_page=True,
                        right_page=True,
                        barcode=True):
    """Return a PDF (as a string) with chip-tag labels for a participant's bib.

    One page is drawn per requested side: a left page when *left_page* is
    true, then a right page when *right_page* is true.  Each page holds an
    arrow and the sponsor name in the header, the bib number in the centre,
    and the rider name, system name and (optionally) a Code 128 barcode in
    the footer.

    *sponsor_name* defaults to the sponsor of the competition's number set
    when there is one, else to the competition name.  The barcode is drawn
    only when *barcode* is true and the license holder has a ``uci_id`` or
    ``license_code``.
    """
    competition = participant.competition
    license_holder = participant.license_holder
    bib = participant.bib
    name = license_holder.first_last
    # Fall back to the short form for names longer than 32 characters.
    if len(name) > 32:
        name = license_holder.first_last_short
    if sponsor_name is None:
        if competition.number_set and competition.number_set.sponsor:
            sponsor_name = competition.number_set.sponsor
        else:
            sponsor_name = competition.name
    system_name = 'CrossMgr'
    # Use points at the units.
    page_width = 3.9 * inches_to_points
    page_height = 2.4 * inches_to_points
    pdf = PDF('L', (page_height, page_width))
    pdf.set_author(RaceDBVersion)
    pdf.set_title('Race Bib Number: {}'.format(bib))
    pdf.set_subject(
        'Bib number and rider info to be printed as a label to apply on the chip tag.'
    )
    pdf.set_creator(getpass.getuser())
    pdf.set_keywords(
        'RaceDB CrossMgr Bicycle Racing Software Database Road Time Trial MTB CycloCross RFID'
    )
    pdf.add_font('din1451alt',
                 style='',
                 fname=get_font_file('din1451alt G.ttf'),
                 uni=True)
    pdf.add_font('Arrows',
                 style='',
                 fname=get_font_file('Arrrows-Regular.ttf'),
                 uni=True)
    # Layout rectangles: header strip on top, footer strip of the same
    # height at the bottom, and the bib field filling the space in between.
    margin = min(page_height, page_width) / 18.0
    sep = margin / 2.5
    height = page_height - margin * 2.0
    width = page_width - margin * 2.0
    header = Rect(margin, margin, width, height / 18.0)
    footer = Rect(margin, page_height - margin - header.height, header.width,
                  header.height)
    field = Rect(header.x, header.bottom + sep, width,
                 footer.top - header.bottom - sep * 2)
    license_code = license_holder.uci_id or license_holder.license_code
    # Characters drawn with the 'Arrows' font for the left and right pages.
    leftArrow, rightArrow = 'A', 'a'
    font_name = 'Helvetica'
    # lp is True while drawing the left page and False for the right page.
    for lp in ([True] if left_page else []) + ([False] if right_page else []):
        pdf.add_page()
        # The arrow gets a box twice the header height, centred on the header.
        arrow = copy.deepcopy(header)
        arrow.y -= arrow.height * 0.5
        arrow.height *= 2
        pdf.set_font('Arrows')
        arrowWidth = arrow.draw_text_to_fit(
            pdf,
            leftArrow if lp else rightArrow,
            (Rect.AlignLeft if lp else Rect.AlignRight) | Rect.AlignMiddle,
            consider_descenders=True,
            convert_to_text=False,
        )
        arrowWidth += pdf.get_string_width(' ')
        # The sponsor name uses the header width left over beside the arrow.
        header_remain = copy.deepcopy(header)
        if lp:
            header_remain.x += arrowWidth
        header_remain.width -= arrowWidth
        pdf.set_font(font_name)
        header_remain.draw_text_to_fit(
            pdf, sponsor_name,
            (Rect.AlignLeft if lp else Rect.AlignRight) | Rect.AlignMiddle,
            True)
        pdf.set_font('din1451alt', '', 16)
        field.draw_text_to_fit(pdf, bib, Rect.AlignCenter | Rect.AlignMiddle)
        pdf.set_font(font_name)
        # Footer: rider name on the side opposite the arrow.
        name_width = footer.draw_text_to_fit(
            pdf, name,
            (Rect.AlignRight if lp else Rect.AlignLeft) | Rect.AlignMiddle)
        # The system name goes in the footer space not taken by the rider
        # name, and only when more than 20 units of width remain.
        logo = copy.deepcopy(footer)
        if not lp:
            logo.x += name_width + sep
        logo.width -= name_width + sep
        if logo.width > 20:
            logo_width = logo.draw_text_to_fit(
                pdf, system_name,
                (Rect.AlignLeft if lp else Rect.AlignRight) | Rect.AlignMiddle)
        else:
            logo_width = 0
        if barcode:
            # The barcode fills the footer between the system name and the rider name.
            remaining_width = header.width - name_width - logo_width
            if lp:
                barcode_rect = Rect(footer.x + logo_width, footer.y,
                                    remaining_width, footer.height)
            else:
                barcode_rect = Rect(
                    footer.right - logo_width - remaining_width, footer.y,
                    remaining_width, footer.height)
            if license_code:
                draw_code128(pdf, license_code, barcode_rect.x, barcode_rect.y,
                             barcode_rect.width, barcode_rect.height)
    pdf_str = pdf.output(dest='s')
    return pdf_str
def print_bib_on_rect(bib,
                      license_code=None,
                      name=None,
                      logo=None,
                      widthInches=5.9,
                      heightInches=3.9,
                      copies=1,
                      onePage=False):
    """Return a PDF (as a string) with *copies* of a bib number on a rectangle.

    Each copy shows the bib in the centre and, along the bottom, the
    optional *logo* text on the left, an optional Code 128 barcode of
    *license_code*, and the optional *name* on the right.

    With *onePage* all copies are stacked on a single page whose height is
    multiplied by *copies*, separated by dashed lines; otherwise every copy
    gets its own page.  ``barcode_width_max`` and ``inches_to_points`` are
    not defined in this function and must come from the enclosing module.
    """
    page_width = widthInches * inches_to_points
    page_height = heightInches * inches_to_points
    pdf = PDF('L', (page_height * (copies if onePage else 1), page_width))
    pdf.set_author(RaceDBVersion)
    pdf.set_title('Race Bib Number: {}'.format(bib))
    pdf.set_subject('Bib number.')
    pdf.set_creator(getpass.getuser())
    pdf.set_keywords(
        'RaceDB CrossMgr Bicycle Racing Software Database Road Time Trial MTB CycloCross RFID'
    )
    pdf.add_font('din1451alt',
                 style='',
                 fname=get_font_file('din1451alt G.ttf'),
                 uni=True)
    margin = min(page_height, page_width) / 17.5
    sep = margin / 2.5  # NOTE(review): not used below.
    height = page_height - margin * 2.0
    width = page_width - margin * 2.0
    text_margin = margin
    text_height = margin * 0.4
    for c in xrange(copies):
        if c == 0 or not onePage:
            pdf.add_page()
            page_y = 0
        else:
            # Stacked copy: shift down by one copy height and draw a dashed
            # separator line at its top edge.
            page_y = page_height * c
            pdf.dashed_line(0, page_y, page_width, page_y, space_length=12)
        pdf.set_font('din1451alt', '', 16)
        field = Rect(margin, margin + page_y, width, height)
        field.draw_text_to_fit(pdf, bib, Rect.AlignCenter | Rect.AlignMiddle)
        pdf.set_font('Helvetica')
        if logo:
            # Bottom-left text, ending where the widest allowed barcode starts.
            x = text_margin
            logo_rect = Rect(x, page_height - margin + page_y,
                             (page_width - barcode_width_max) / 2.0 - x,
                             text_height)
            logo_rect.draw_text_to_fit(pdf, logo,
                                       Rect.AlignLeft | Rect.AlignMiddle)
        if license_code:
            barcode_rect = Rect(margin, page_height - margin * 1.2 + page_y,
                                width, margin * 0.8)
            draw_code128(pdf, license_code, barcode_rect.x, barcode_rect.y,
                         barcode_rect.width, barcode_rect.height)
        if name:
            # Bottom-right text, starting where the widest allowed barcode ends.
            x = (page_width + barcode_width_max) / 2.0
            name_rect = Rect(x, page_height - margin + page_y,
                             page_width - text_margin - x, text_height)
            name_rect.draw_text_to_fit(pdf, name,
                                       Rect.AlignRight | Rect.AlignMiddle)
    pdf_str = pdf.output(dest='s')
    return pdf_str
from pdf import PDF, Word, Document, Html


def show_common(obj):
    """Display *obj* according to its exact type.

    ``Html`` objects print themselves through ``print()``; ``PDF``, ``Word``
    and ``Document`` objects have the result of ``show()`` printed.  Objects
    of any other type (subclasses included) are ignored.
    """
    kind = type(obj)
    if kind == Html:
        obj.print()
    elif kind in (PDF, Word, Document):
        print(obj.show())


# One document of every kind, shown through the common entry point.
docs = [PDF('Doc1'), Word('Doc2'), Document('Doc3'), Html('Doc4')]
for d in docs:
    show_common(d)
# finds path of files # making an output directory for pdf merged output_merge = file_path_variable + '/output_merged' if not os.path.exists(output_merge): os.mkdir(output_merge) for subdir, dirs, files in os.walk(file_path_variable): for filename in files: filepath = subdir + os.sep + filename print(filepath) ppt = Ex() if filepath.endswith(".ppt"): folder = filepath.replace('/', '\\') ppt.ppt_convert(folder) if filepath.endswith(".pptx"): folder = filepath.replace('/', '\\') ppt.ppt_convert(folder) if filepath.endswith(".pptm"): folder = filepath.replace('/', '\\') ppt.ppt_convert(folder) pdf = PDF() pdf.p(file_path_variable, output_merge.replace('/', '\\')) root.mainloop()
def gen_pdf_quiz(cfg):
    """Write one quiz PDF per user described in *cfg*.

    For each user, ``global.pages`` pages are produced; every page gets the
    title, the configured date and a freshly generated batch of quizzes.
    The file is written to ``global.quiz_dir`` (with ``~`` expanded) as
    ``<user>.pdf``, or ``quizzes.pdf`` when the user key is empty.
    """
    for user in cfg:
        pdf = PDF()
        user_cfg = cfg[user]

        page_count = user_cfg['global']['pages']
        for _ in range(page_count):
            pdf.add_page()
            pdf.set_title('四则运算练习')
            pdf.set_date(date=user_cfg['global']['show_date'])
            pdf.set_quizzes(quizzes=bulk_quiz_gen(user_cfg))

        quiz_dir = os.path.expanduser(user_cfg['global']['quiz_dir'])
        base_name = f'{user}.pdf' if user else 'quizzes.pdf'
        pdf_filename = os.path.join(quiz_dir, base_name)
        pdf.output(f'{pdf_filename}', 'F')
if len(conf.cmd_parts) == 0: arglist = ("aktoversigt", "roles", "frontpage", "props", "contacts", "material","individual", "songmanus") elif "manus" in sys.argv: arglist = ("aktoversigt", "roles", "frontpage", "props", "contacts", "material") else: arglist = sys.argv[1:] for arg in arglist: create_parts(revue, arg) if len(conf.cmd_parts) == 0 or "manus" in sys.argv: pdf = PDF() pdf.pdfmerge((os.path.join(path["pdf"],"forside.pdf"), os.path.join(path["pdf"],"aktoversigt.pdf"), os.path.join(path["pdf"],"rolleliste.pdf"), revue, os.path.join(path["pdf"],"rekvisitliste.pdf"), os.path.join(path["pdf"],"kontaktliste.pdf")), os.path.join(path["pdf"],"manuskript.pdf")) print("Manuscript successfully created!") for act in revue.acts: for material in act.materials: metadata.update_mod_time(material) for f in glob.glob(os.path.join(path["pdf"], "*.pdf")):
class CrossMgrPrintoutPDF(CrossMgrPrintout):
    """Printout that draws its pages into PDF files.

    With ``allInOne`` every page goes into a single ``<fileBase>.pdf``
    written at the end of printing; otherwise one
    ``<fileBase>-<category>.pdf`` is written after each category's last
    page.  ``lastFName`` holds the path of the most recently written file.
    """

    def __init__(self, dir, fileBase, orientation, categories=None, allInOne=False):
        CrossMgrPrintout.__init__(self, categories)
        self.dir, self.fileBase = dir, fileBase
        self.orientation, self.allInOne = orientation, allInOne
        # No document is open and nothing has been written yet.
        self.pdf = self.lastFName = None

    def OnEndPrinting(self):
        """Write the combined PDF (all-in-one mode only), then defer to the base class."""
        if not (self.pdf and self.allInOne):
            return super(CrossMgrPrintoutPDF, self).OnEndPrinting()

        if self.dir and not os.path.isdir(self.dir):
            os.mkdir(self.dir)
        fname = u'{fileBase}.pdf'.format(fileBase=self.fileBase)
        title = os.path.splitext(fname)[0].replace('-', ' ')
        self.pdf.set_title(unicode(title).encode('iso-8859-1', 'ignore'))
        fname = os.path.join(self.dir, fname)
        self.pdf.output(fname, 'F')
        self.lastFName = fname
        self.pdf = None
        return super(CrossMgrPrintoutPDF, self).OnEndPrinting()

    def OnPrintPage(self, page):
        """Draw one page into the current PDF; always returns True.

        The document is created lazily on the first page.  Outside
        all-in-one mode it is written and discarded once the page number
        reaches the category's page total.
        """
        def latin1(text):
            # Metadata is stored as ISO-8859-1; unencodable characters are dropped.
            return unicode(text).encode('iso-8859-1', 'ignore')

        exportGrid = self.prepareGrid(page)

        info = self.pageInfo[page]
        category, pageNumber, pageTotal = info[0], info[3], info[4]

        fname = u'{fileBase}-{categoryName}.pdf'.format(
            fileBase=self.fileBase,
            categoryName=category.fullname if category != 'Primes' else 'Primes',
        )
        fname = Utils.RemoveDisallowedFilenameChars(fname).replace(' ', '-')

        if not self.pdf:
            landscape = (self.orientation == wx.LANDSCAPE)
            self.pdf = PDF(orientation='L' if landscape else 'P')
            self.pdf.set_font('Arial', '', 12)
            self.pdf.set_author(latin1(getpass.getuser()))
            self.pdf.set_keywords(latin1('CrossMgr Results'))
            self.pdf.set_creator(latin1(Version.AppVerName))
            self.pdf.set_title(
                latin1(os.path.splitext(fname)[0].replace('-', ' ')))

        # The middle entries of the page info are passed through as draw arguments.
        exportGrid.drawToFitPDF(*([self.pdf, self.orientation] + info[1:-1]))

        if self.allInOne or not pageNumber == pageTotal:
            return True

        # Last page of this category: write the file and start afresh.
        if self.dir and not os.path.isdir(self.dir):
            os.mkdir(self.dir)
        fname = os.path.join(self.dir, fname)
        self.pdf.output(fname, 'F')
        self.lastFName = fname
        self.pdf = None
        return True
def __init__(self, font='JetBrainsMono'):
    """Create the underlying PDF with *font* and start with an empty error list."""
    self.pdf, self.errors = PDF(font), []
def pdf(request):
    """Return the PDF built for ``request.display_user`` as a response."""
    return PDF(request.display_user).as_response()
def print_id_label(participant):
    """Return a one-page PDF (as a string) with rider ID and emergency details.

    The page shows the rider name in bold as the header, a table with age,
    gender, nation, team, bib, category, phone, medical alert and emergency
    contact in the middle, and the system name in the footer.  Rows for
    team, phone, medical alert and emergency contact name appear only when
    the corresponding value is set.
    """
    competition = participant.competition  # NOTE(review): not used below.
    license_holder = participant.license_holder
    bib = participant.bib
    name = license_holder.first_last
    # Fall back to the short form for names longer than 32 characters.
    if len(name) > 32:
        name = license_holder.first_last_short
    system_name = 'CrossMgr'
    inches_to_points = 72.0
    # Use points at the units.
    page_width = 3.9 * inches_to_points
    page_height = 2.4 * inches_to_points
    pdf = PDF('L', (page_height, page_width))
    pdf.set_author(RaceDBVersion)
    pdf.set_title('Bib Number: {}'.format(bib))
    pdf.set_subject('Rider ID and Emergency Information.')
    pdf.set_creator(getpass.getuser())
    pdf.set_keywords(
        'RaceDB CrossMgr Bicycle Racing Software Database Road Time Trial MTB CycloCross'
    )
    # Layout rectangles: header on top, footer at the bottom, and the
    # information table filling the space in between.
    margin = min(page_height, page_width) / 18.0
    sep = margin / 2.5
    height = page_height - margin * 2.0
    width = page_width - margin * 2.0
    header = Rect(margin, margin, width, height / 10.0)
    footer_height = height / 20
    footer = Rect(margin, page_height - margin - footer_height, header.width,
                  footer_height)
    field = Rect(header.x, header.bottom + sep, width,
                 footer.top - header.bottom - sep * 2)
    leftArrow, rightArrow = chr(172), chr(174)  # NOTE(review): not used below.
    font_name = 'Helvetica'
    pdf.add_page()
    pdf.set_font(font_name, 'b')
    header.draw_text_to_fit(pdf, name, Rect.AlignLeft, True)
    pdf.set_font(font_name)
    # Table rows are [first column, text]; the first column is always empty.
    info = []
    info.append([
        '',
        u', '.join([
            u'Age: {}'.format(license_holder.get_age()),
            u'Gender: {}'.format(license_holder.get_gender_display()),
            u'Nation: {}'.format(license_holder.nation_code),
        ]),
    ])
    # Blank spacer row.
    info.append(['', ''])
    if participant.team:
        info.append(['', u'{}'.format(participant.team.name)])
    info.append([
        '',
        u', '.join([
            u'Bib: {}'.format(participant.bib),
            u'Category: {}'.format(participant.category.code_gender
                                   if participant.category else ''),
        ]),
    ])
    if license_holder.phone:
        info.append([
            '',
            u' '.join([
                u'Phone: {}'.format(format_phone(license_holder.phone)),
            ]),
        ])
    # Blank spacer row before the emergency details.
    info.append(['', ''])
    if license_holder.emergency_medical:
        info.append([
            '', u'Medical Alert: {}'.format(license_holder.emergency_medical)
        ])
    info.append(['', u'Emergency Contact:'])
    if license_holder.emergency_contact_name:
        # NOTE(review): the 'None provided' fallback cannot trigger inside
        # this check -- confirm whether the row was meant to be unconditional.
        info.append([
            '', u' {}'.format(license_holder.emergency_contact_name
                              or 'None provided')
        ])
    info.append([
        '', u' {}'.format(
            format_phone(license_holder.emergency_contact_phone)
            or 'No phone number provided')
    ])
    pdf.table_in_rectangle(field.x,
                           field.y,
                           field.width,
                           field.height,
                           info,
                           leftJustifyCols=[0, 1],
                           hasHeader=False,
                           horizontalLines=False)
    footer.draw_text_to_fit(pdf, system_name, Rect.AlignRight, True)
    pdf_str = pdf.output(dest='s')
    return pdf_str
class DocumentMetadataIngestModule(DataSourceIngestModule):
    """Data-source ingest module that extracts document metadata.

    Files with the extensions pdf, docx, pptx, xlsx, doc, ppt and xls are
    copied to the case's temp directory, parsed with the matching parser
    (``PDF``, ``OOXML`` or ``CFBF``) and the resulting key/value metadata is
    posted as blackboard artifacts.  Parser failures on individual files
    are ignored on purpose.
    """

    _logger = Logger.getLogger(DocumentMetadataIngestModuleFactory.moduleName)

    def log(self, level, msg):
        """Log *msg*, tagged with this class name and the calling function's name."""
        caller = inspect.stack()[1][3]
        self._logger.logp(level, self.__class__.__name__, caller, msg)

    def __init__(self, settings):
        """Create the parsers and one empty result list per file type.

        ``settings`` is accepted but not used here.
        """
        self.context = None
        self.totalCount = 0

        # One parser object per container format.
        self.cfbf = CFBF()
        self.pdf = PDF()
        self.ooxml = OOXML()

        # Independent accumulators, one per handled extension / PDF parser.
        for bucket in ("xls_result", "ppt_result", "doc_result",
                       "xlsx_result", "pptx_result", "docx_result",
                       "PyPDF_result", "PDFNoModule_result"):
            setattr(self, bucket, [])

    def startUp(self, context):
        """Remember the ingest job context."""
        self.context = context

    def getTitles(self, result):
        """Return the distinct keys used across *result*, excluding ``"handle"``."""
        keys = [
            key
            for record in result
            for key in record.keys()
            if key != "handle"
        ]
        return list(set(keys))

    def addData(self, titles, result, filetype, skCase):
        """Register attribute/artifact types for *filetype* and post every record.

        Each record in *result* becomes one artifact on the file stored under
        its ``"handle"`` key; a title missing from a record is posted as an
        empty string.
        """
        # Type registration is best-effort; failures are ignored (presumably
        # because the types may already exist -- confirm).
        for title in titles:
            try:
                attID = skCase.addArtifactAttributeType(
                    "TSK_" + filetype + "_" + str(title),
                    BlackboardAttribute.
                    TSK_BLACKBOARD_ATTRIBUTE_VALUE_TYPE.STRING,
                    unicode(title))
                artID_art = skCase.addBlackboardArtifactType(
                    "TSK_" + filetype + "_DATA", filetype)
            except:
                pass

        artifact_type_id = skCase.getArtifactTypeID("TSK_" + filetype + "_DATA")
        for record in result:
            artifact = record["handle"].newArtifact(artifact_type_id)
            for title in titles:
                try:
                    artifact.addAttribute(
                        BlackboardAttribute(
                            skCase.getAttributeType("TSK_" + filetype + "_" +
                                                    str(title)),
                            DocumentMetadataIngestModuleFactory.moduleName,
                            unicode(record[title])))
                except:
                    # Fall back to an empty value for this title.
                    artifact.addAttribute(
                        BlackboardAttribute(
                            skCase.getAttributeType("TSK_" + filetype + "_" +
                                                    str(title)),
                            DocumentMetadataIngestModuleFactory.moduleName,
                            ""))

    def startModule(self, extension, skCase, fileManager, dataSource,
                    progressBar):
        """Process every ``*.<extension>`` file of *dataSource* and post the results."""
        files = fileManager.findFiles(dataSource, "%." + extension)
        progressBar.switchToDeterminate(len(files))

        scratch_dir = os.path.join(Case.getCurrentCase().getTempDirectory(),
                                   extension + " files")
        try:
            os.mkdir(scratch_dir)
        except:
            pass

        kind = extension.lower()
        # extension -> (parser, result list, artifact label) for the formats
        # whose parser returns a single metadata dict.
        single_result = {
            "xlsx": (self.ooxml, self.xlsx_result, "XLSX"),
            "pptx": (self.ooxml, self.pptx_result, "PPTX"),
            "docx": (self.ooxml, self.docx_result, "DOCX"),
            "xls": (self.cfbf, self.xls_result, "XLS"),
            "ppt": (self.cfbf, self.ppt_result, "PPT"),
            "doc": (self.cfbf, self.doc_result, "DOC"),
        }

        for processed, entry in enumerate(files, 1):
            self.log(Level.INFO, "Processing file: " + entry.getName())
            self.totalCount += 1

            # Parsers work on a local copy of the file.
            local_path = os.path.join(scratch_dir, unicode(entry.getName()))
            ContentUtils.writeToFile(entry, File(local_path))

            if kind == "pdf":
                # The PDF parser returns two metadata dicts per file.
                try:
                    resultPDF, resultNoModule = self.pdf.run(local_path)
                    resultPDF["handle"] = entry
                    resultNoModule["handle"] = entry
                    self.PyPDF_result.append(resultPDF)
                    self.PDFNoModule_result.append(resultNoModule)
                except:
                    pass
            elif kind in single_result:
                parser, bucket, _ = single_result[kind]
                try:
                    metadata = parser.run(local_path)
                    metadata["handle"] = entry
                    bucket.append(metadata)
                except:
                    pass

            progressBar.progress(processed)

        # Post everything collected so far for this extension.
        if kind == "pdf":
            for bucket, label in ((self.PyPDF_result, "PyPDF"),
                                  (self.PDFNoModule_result, "PDFNoModule")):
                titles = self.getTitles(bucket)
                self.addData(titles, bucket, label, skCase)
        elif kind in single_result:
            _, bucket, label = single_result[kind]
            titles = self.getTitles(bucket)
            self.addData(titles, bucket, label, skCase)

    def process(self, dataSource, progressBar):
        """Run every supported extension over *dataSource* and post a summary message."""
        progressBar.switchToIndeterminate()
        current_case = Case.getCurrentCase()
        skCase = current_case.getSleuthkitCase()
        fileManager = Case.getCurrentCase().getServices().getFileManager()

        for extension in ("pdf", "docx", "pptx", "xlsx", "doc", "ppt", "xls"):
            self.startModule(extension, skCase, fileManager, dataSource,
                             progressBar)

        message = IngestMessage.createMessage(
            IngestMessage.MessageType.DATA, "DocumentMetadataParser",
            "Found %d files" % self.totalCount)
        IngestServices.getInstance().postMessage(message)
        return IngestModule.ProcessResult.OK
outlines, next_outlines = itertools.tee(outlines, 2) next_outlines = itertools.chain(itertools.islice(next_outlines, 1, None), [None]) return outlines, next_outlines if __name__ == "__main__": import get_pdf from test_cases import test_cases from get_data import HKEX_API from helper import write_to_csv, n_yearsago, today # logging.basicConfig(level=logging.INFO) # query = HKEX_API(from_date=n_yearsago(n=1), to_date=today()) # for data in query.data: # result = {} # url = data.file_link # pdf = PDF(url) # print(url) # f = TableOfContent(pdf.pdf_obj) # print() # result['result'] = f.search_outline_page_range(TableOfContent.auditor_remunration_regex) # result['url'] = url # write_to_csv(result, 'result_2.csv') url = 'https://www1.hkexnews.hk/listedco/listconews/sehk/2020/0813/2020081300777.pdf' url = 'https://www1.hkexnews.hk/listedco/listconews/sehk/2020/0813/2020081300670.pdf' pdf = PDF(url) print(url) f = TableOfContent(pdf.pdf_obj) print(f.search_outline_page_range(TableOfContent.audit_fee_regex))
# "--tex-all" on the command line forces re-TeXing of every file.
if "--tex-all" in sys.argv:
    conf["TeXing"]["force TeXing of all files"] = "yes"

# Load the revue from its plan file.
revue = cr.Revue.fromfile("aktoversigt.plan")
path = revue.conf["Paths"]
conv = cv.Converter()

# Decide which parts to build: everything when no parts were requested,
# the manuscript's parts for "manus", otherwise exactly the arguments given.
if len(conf.cmd_parts) == 0:
    arglist = ("aktoversigt", "roles", "frontpage", "props", "contacts",
               "material", "individual", "songmanus")
elif "manus" in sys.argv:
    arglist = ("aktoversigt", "roles", "frontpage", "props", "contacts",
               "material")
else:
    arglist = sys.argv[1:]

create_parts(revue, arglist)

# Assemble the full manuscript from the generated parts.  Every file is
# paired with a label; the revue object itself sits between the role list
# and the props list (presumably expanded by the merger into all materials
# -- confirm against PDF.pdfmerge).
if len(conf.cmd_parts) == 0 or "manus" in sys.argv:
    pdf = PDF()
    pdf.pdfmerge(
        ((os.path.join(path["pdf"], "forside.pdf"), "Forside"),
         (os.path.join(path["pdf"], "aktoversigt.pdf"), "Aktoversigt"),
         (os.path.join(path["pdf"], "rolleliste.pdf"), "Rolleliste"),
         revue,
         (os.path.join(path["pdf"], "rekvisitliste.pdf"), "Rekvisitliste"),
         (os.path.join(path["pdf"], "kontaktliste.pdf"), "Kontaktliste")),
        os.path.join(path["pdf"], "manuskript.pdf"))
    print("Manuscript successfully created!")
def doExport( self, event=None ):
    """Export the team results of the current race to Excel and PDF.

    Two files are written next to the race file (``Test.cmn`` is used as
    the base name when there is no main window):
    ``<base>-TeamResults.xlsx`` with one worksheet per published category,
    and ``<base>-TeamResults.pdf`` with every published category drawn in
    portrait orientation.  Categories without an export grid are skipped.
    A failure in either export is logged through ``logException`` and does
    not prevent the other export.  Does nothing when no race is loaded.
    """
    race = Model.race
    if not race:
        return

    fileName = Utils.getMainWin().fileName if Utils.getMainWin() else 'Test.cmn'
    baseName = os.path.splitext(fileName)[0]

    #---------------------------------------------------------------------------------
    # Create an Excel file.
    #
    xlFileName = baseName + '-TeamResults.xlsx'

    # Bind the name up front: if the Workbook constructor raises, the
    # "del wb" below would otherwise fail with UnboundLocalError and abort
    # the PDF export as well.
    wb = None
    try:
        wb = xlsxwriter.Workbook( xlFileName )
        formats = ExportGrid.ExportGrid.getExcelFormatsXLSX( wb )

        ues = Utils.UniqueExcelSheetName()
        for category in race.getCategories( publishOnly=True ):
            eg = self.toExportGrid( category )
            if eg:
                ws = wb.add_worksheet( ues.getSheetName(category.fullname) )
                eg.toExcelSheetXLSX( formats, ws )
        wb.close()
    except Exception as e:
        logException( e, sys.exc_info() )
    del wb

    #---------------------------------------------------------------------------------
    # Create a PDF file.
    #
    pdfFileName = baseName + '-TeamResults.pdf'

    # Same reasoning as for wb: keep the final "del pdf" safe.
    pdf = None
    try:
        pdf = PDF( orientation = 'P' )
        pdf.set_font( 'Arial', '', 12 )
        pdf.set_author( getpass.getuser() )
        pdf.set_keywords( 'CrossMgr Team Results' )
        pdf.set_creator( Version.AppVerName )
        pdf.set_title( os.path.splitext(pdfFileName)[0].replace('-', ' ') )
        for category in race.getCategories( publishOnly=True ):
            eg = self.toExportGrid( category )
            if eg:
                eg.drawToFitPDF( pdf, orientation=wx.PORTRAIT )
        pdf.output( pdfFileName, 'F' )
    except Exception as e:
        logException( e, sys.exc_info() )
    del pdf
def vectorize(self, path, ignore_if_html_file_exists=True, ignore_errors=False, num_threads=1):
    """
    Vectorize the paragraphs of every PDF found under ``path``.

    Each file listed by ``path`` whose extension is ``pdf`` (case-insensitive)
    is converted to HTML, its paragraphs are read, and every paragraph is
    passed to ``self._bert_vectorizer.vectorize``.  The result has the columns
    ``pdf``, ``paragraph_num``, ``num_paragraphs`` and ``num_tokens`` brought
    to the front, followed by ``bert_1`` ... ``bert_n`` where ``n`` is the
    second dimension of the returned vector.

    With ``ignore_errors=True`` a PDF or paragraph that fails is left out of
    the result.  Previously the failure markers were passed on to ``concat``,
    which only accepts Series and DataFrame objects, so that mode could not
    produce a result once anything had failed.

    NOTE(review): when nothing at all could be vectorized, ``concat`` is still
    called with an empty list, exactly as the original did for an empty
    directory - presumably it raises ``ValueError``; confirm whether an empty
    frame would be preferable.

    :type path: str or Path
    :type ignore_if_html_file_exists: bool
    :type ignore_errors: bool
    :type num_threads: int
    :rtype: DataFrame
    """
    path = Path(path)
    pdf_paths = [
        PDF(file_path) for file_path in path.list(show_size=False)
        if file_path.extension.lower() == 'pdf'
    ]

    # create htmls
    for pdf_path in iterate(iterable=pdf_paths, text='converting pdfs to html'):
        try:
            pdf_path.convert_to_html(ignore_if_exists=ignore_if_html_file_exists)
        except Exception:
            if not ignore_errors:
                raise

    # get paragraphs
    def extract_paragraphs(pdf_path):
        try:
            pdf_paragraphs = pdf_path.paragraphs
            num_paragraphs = len(pdf_paragraphs)
            return [
                {
                    'pdf': pdf_path,
                    'paragraph_num': i + 1,
                    'paragraph': paragraph,
                    'num_paragraphs': num_paragraphs
                }
                for i, paragraph in enumerate(pdf_paragraphs)
            ]
        except Exception:
            if ignore_errors:
                # A PDF that cannot be read contributes no paragraphs.
                return []
            raise

    if num_threads == 1:
        paragraph_dict_lists = [
            extract_paragraphs(x)
            for x in iterate(pdf_paths, text='extracting paragraphs (single-threaded)')
        ]
    else:
        processor = Parallel(n_jobs=num_threads, backend='threading', require='sharedmem')
        paragraph_dict_lists = processor(
            delayed(extract_paragraphs)(pdf_path=x)
            for x in iterate(pdf_paths, text='extracting paragraphs (multi-threaded)')
        )

    # Flatten the per-PDF lists into one list of paragraph records.
    paragraph_dicts = [
        x
        for paragraph_dict_list in paragraph_dict_lists
        for x in paragraph_dict_list
    ]

    # create vectors
    def get_vector_and_num_tokens(paragraph_dict):
        try:
            pdf_path = paragraph_dict['pdf']
            paragraph_num = paragraph_dict['paragraph_num']
            paragraph = paragraph_dict['paragraph']
            num_paragraphs = paragraph_dict['num_paragraphs']

            vector, num_tokens = self._bert_vectorizer.vectorize(
                text=paragraph, get_num_tokens=True)
            vector_df = DataFrame(
                vector,
                columns=[f'bert_{i + 1}' for i in range(vector.shape[1])])
            vector_df['pdf'] = pdf_path.name_and_extension
            vector_df['num_paragraphs'] = num_paragraphs
            vector_df['paragraph_num'] = paragraph_num
            vector_df['num_tokens'] = num_tokens
            return vector_df
        except Exception:
            if ignore_errors:
                # Marker for "skip this paragraph"; filtered out before concat.
                return None
            raise

    if num_threads == 1:
        vectors = [
            get_vector_and_num_tokens(paragraph_dict=x)
            for x in iterate(
                paragraph_dicts,
                text='converting paragraphs to vectors (single-threaded)')
        ]
    else:
        processor = Parallel(n_jobs=num_threads, backend='threading', require='sharedmem')
        vectors = processor(
            delayed(get_vector_and_num_tokens)(paragraph_dict=x)
            for x in iterate(
                paragraph_dicts,
                text='converting paragraphs to vectors (multi-threaded)')
        )

    # Only real frames may reach concat; ignored failures are dropped here.
    frames = [v for v in vectors if isinstance(v, DataFrame)]

    return bring_to_front(
        data=concat(frames),
        columns=['pdf', 'paragraph_num', 'num_paragraphs', 'num_tokens']
    ).reset_index(drop=True)
class CrossMgrPrintoutPDF( CrossMgrPrintout ):
    """
    Printout that draws its pages into PDF documents.

    With ``allInOne`` set, all pages go into a single ``<fileBase>.pdf`` that
    is written when printing ends.  Otherwise a ``<fileBase>-<category>.pdf``
    is written as soon as a page whose page number equals its page total has
    been drawn.  ``lastFName`` holds the path of the most recently written file.
    """

    def __init__( self, dir, fileBase, orientation, categories = None, allInOne = False ):
        CrossMgrPrintout.__init__(self, categories)
        self.dir, self.fileBase = dir, fileBase
        self.orientation, self.allInOne = orientation, allInOne
        # No document is open and nothing has been written yet.
        self.pdf = None
        self.lastFName = None

    def OnEndPrinting(self):
        """Write the combined document, if one is pending, then defer to the base class."""
        if self.pdf and self.allInOne:
            outDir = self.dir
            if outDir and not os.path.isdir( outDir ):
                os.mkdir( outDir )

            baseName = u'{fileBase}.pdf'.format( fileBase=self.fileBase )
            title = os.path.splitext(baseName)[0].replace('-', ' ')
            self.pdf.set_title( unicode(title).encode('iso-8859-1','ignore') )

            fullName = os.path.join( outDir, baseName )
            self.pdf.output( fullName, 'F' )
            self.lastFName = fullName
            self.pdf = None

        return super(CrossMgrPrintoutPDF, self).OnEndPrinting()

    def OnPrintPage( self, page ):
        """Draw one page into the current document; always returns True."""
        exportGrid = self.prepareGrid( page )

        try:
            category = self.pageInfo[page][0]
        except Exception:
            # Handle case of no data.
            return True

        info = self.pageInfo[page]
        pageNumber, pageTotal = info[3], info[4]

        categoryName = category.fullname if category != 'Primes' else 'Primes'
        fname = u'{fileBase}-{categoryName}.pdf'.format(
            fileBase = self.fileBase,
            categoryName = categoryName
        )
        fname = Utils.RemoveDisallowedFilenameChars( fname ).replace( ' ', '-' )

        if not self.pdf:
            # Start a new document and stamp its metadata (latin-1, dropping
            # characters that cannot be encoded).
            def latin1( s ):
                return unicode(s).encode('iso-8859-1','ignore')

            landscape = (self.orientation == wx.LANDSCAPE)
            self.pdf = PDF( orientation = 'L' if landscape else 'P' )
            self.pdf.set_font( 'Arial', '', 12 )
            self.pdf.set_author( latin1(getpass.getuser()) )
            self.pdf.set_keywords( latin1('CrossMgr Results') )
            self.pdf.set_creator( latin1(Version.AppVerName) )
            self.pdf.set_title( latin1(os.path.splitext(fname)[0].replace('-', ' ')) )

        drawArgs = [self.pdf, self.orientation] + info[1:-1]
        exportGrid.drawToFitPDF( *drawArgs )

        if pageNumber == pageTotal and not self.allInOne:
            # This per-category document is complete: write it and close it.
            if self.dir and not os.path.isdir( self.dir ):
                os.mkdir( self.dir )
            fname = os.path.join( self.dir, fname )
            self.pdf.output( fname, 'F' )
            self.lastFName = fname
            self.pdf = None

        return True
# (Tail of a method whose `def` line lies above this chunk.)
return f'{self.__class__.__name__} - {self.section}'


# Ad-hoc driver: walk the file links returned by HKEX_API and, for each PDF,
# look up the first page/section/table of the audit-fee part of its
# corporate governance report.
if __name__ == '__main__':
    from hkex_api import HKEX_API

    # Sample documents kept for reference:
    # https://www1.hkexnews.hk/listedco/listconews/gem/2020/0929/2020092901098.pdf #concat number
    # https://www1.hkexnews.hk/listedco/listconews/sehk/2020/0929/2020092900604.pdf #concat number
    query = HKEX_API()
    urls = [data.file_link for data in query.get_data()]
    # urls = ['https://www1.hkexnews.hk/listedco/listconews/sehk/2020/0923/2020092300374.pdf']
    for url in urls:
        # url = data.file_link
        # url, p = 'https://www1.hkexnews.hk/listedco/listconews/sehk/2020/0721/2020072100713.pdf', 61
        # url, p = 'https://www1.hkexnews.hk/listedco/listconews/sehk/2020/0721/2020072100653.pdf', 94
        print(url)
        pdf = PDF.create(url)

        # Skip documents with no outline entry matching the report title.
        corp_gov_report = pdf.get_outline(CorporateGovReport.title_regex)
        if not corp_gov_report:
            continue

        # Only the first matching outline entry is used.
        corp_gov_report = CorporateGovReport.create(corp_gov_report[0])
        if not corp_gov_report:
            continue
        if not corp_gov_report.audit_fee:
            continue

        # Any failure while taking the first page/section/table is printed and
        # the document is skipped.
        # NOTE(review): page, sec and table are not used in the lines visible
        # here - presumably consumed below this chunk; confirm.
        try:
            page = corp_gov_report.audit_fee.pages[0]
            sec = corp_gov_report.audit_fee.sections[0]
            table = corp_gov_report.audit_fee.tables[0]
        except Exception as e:
            print(e)
            continue
# (Tail of a function whose `def` line lies above this chunk: joins the
# string items of `temp` and returns the result.)
text = ''.join(i for i in temp if isinstance(i,basestring))
return text


# Script body (Python 2): run layout analysis on the PDF named by the first
# command-line argument and add one annotation per layout group.
password = ''
print sys.argv

# NOTE(review): fp is not closed anywhere in the lines visible here.
fp = open(sys.argv[1], 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser)
document.initialize(password)

# Layout-analysis pipeline: the interpreter processes a page and the
# aggregator device then returns that page's layout.
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# Annotation target opened on the same input file.
pdf = PDF(sys.argv[1],verbose=True)

# NOTE(review): page_no is never advanced in the lines visible here; unless
# the increment lies below this chunk, every annotation is added to page 1.
page_no = 1
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
    # receive the LTPage object for the page.
    layout = device.get_result()
    for group in layout.groups:
        # Bounding box as [x0, y0, x1, y1].
        rect = [group.x0,group.y0,group.x1,group.y1]
        # The four corners of the same box, listed as x,y pairs.
        quad = [ group.x0,group.y0, group.x1,group.y0, group.x1,group.y1, group.x0,group.y1 ]
        pdf.add_annot_to_page(page_no,quad,rect)