def reportAddendum(reportFile, listofOutputsTobeAdded, listOfComments=None):
    """Append figures (with optional captions) to an existing .docx report.

    The content is inserted after the existing body of the report and the
    result is saved next to the original as ``<name>_pp.docx``.

    Args:
        reportFile: Path to an existing ``.docx`` report.
        listofOutputsTobeAdded: Iterable of image file paths to insert.
        listOfComments: Optional sequence of caption strings, one per image.

    Returns:
        None.

    Raises:
        Warning: If ``reportFile`` does not look like a ``.docx`` file.
    """
    if ".docx" in reportFile:
        document = Document(reportFile)
    else:
        raise Warning(" problem with the format of the file")
    paragraph_styles = [s for s in document.styles if s.type == WD_STYLE_TYPE.PARAGRAPH]
    postProcessingHeader = document.add_paragraph("PostProcessing", style=paragraph_styles[1])
    postProcessingText = document.add_paragraph(style=paragraph_styles[0])
    postProcessingText.add_run("\rWord format postprocessing generated through the ")
    postProcessingText.add_run("etumos ").bold = True
    postProcessingText.add_run("coupling tool\r\r")
    # enumerate replaces the manual ``ind`` counter so the figure number
    # advances once per picture regardless of whether captions are given
    for ind, picture in enumerate(listofOutputsTobeAdded):
        document.add_picture(picture, width=Cm(7.00))
        document.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
        newParagraph = document.add_paragraph("\r")
        newParagraph.add_run("Figure n. " + str(ind) + ": ").bold = True
        if listOfComments is not None:
            newParagraph.add_run(str(listOfComments[ind]))
        newParagraph.add_run(" \r \r")
        newParagraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    base = reportFile.replace(".docx", "")
    filenameToBeSaved = base + "_pp.docx"
    document.save(filenameToBeSaved)
    return None
def create_review_form(submission, review_form):
    """Build a printable .docx version of a review form and return its path.

    Text-like fields become fill-in-the-blank headings; select/check fields
    become a table of choices. The file is saved under files/forms with a
    random UUID name.
    """
    doc = Document()
    doc.add_heading(submission.title, 0)
    doc.add_paragraph('You should complete this form and then use the review page to upload it.')
    ordered_relations = models.FormElementsRelationship.objects.filter(form=review_form).order_by('order')
    for rel in ordered_relations:
        field_type = rel.element.field_type
        if field_type in ('text', 'textarea', 'date', 'email'):
            doc.add_heading(rel.element.name + ": _______________________________", level=1)
            doc.add_paragraph(rel.help_text).italic = True
        elif field_type in ('select', 'check'):
            doc.add_heading(rel.element.name, level=1)
            if field_type == 'select':
                choices = render_choices(rel.element.choices)
            else:
                choices = ['Y', 'N']
            hint = doc.add_paragraph(rel.help_text)
            hint.add_run(' Mark your choice however you like, as long as it is clear.').italic = True
            choice_table = doc.add_table(rows=2, cols=len(choices))
            header_cells = choice_table.rows[0].cells
            for idx, choice in enumerate(choices):
                header_cells[idx].text = choice[0]
            choice_table.style = 'TableGrid'
    doc.add_page_break()
    forms_dir = os.path.join(settings.BASE_DIR, 'files', 'forms')
    if not os.path.exists(forms_dir):
        os.makedirs(forms_dir)
    path = os.path.join(forms_dir, '%s.docx' % str(uuid4()))
    doc.save(path)
    return path
def WriteTiezi(self): if len(self.pages)==0: print 'Error!' document = Document() # style = document.StyleSheet # style.Fonts.append(Font("\\'cb\\'ce\\'cc\\'e5", 'modern', 134, 30)) # section = Section() # document.Sections.append(section) # tps = TextPS(font=getattr(style.Fonts, "\\'cb\\'ce\\'cc\\'e5")) for i in range(self.page_num): now_page = self.pages[i] for a in now_page: for b in a: document.add_paragraph(b.decode()) if len(a)>1: # pic_num = len(self.img[a[0]]) if self.img.has_key(a[0]): for k in self.img[a[0].decode()]: pic_name = self.getImg(k) document.add_picture(self.dirname + '/' + pic_name) document.add_paragraph('---------------------------------') name = self.url.strip().split('/') name = name[-2] + name[-1] document.save(self.dirname + '/' + name + '.docx') print "Success to dump into " + name + '.docx'
def save2Word():
    """Merge the scraped main content and comments into one Word file.

    Appends ``comment.txt`` onto ``mainContent.txt``, then converts the
    merged text into ``./word/<title>.docx``: image URLs found in a line
    are downloaded and embedded (with a failure note when that breaks),
    and the remaining text of the line is added as a paragraph.

    NOTE(review): Python 2 code -- ``line.decode('UTF-8')`` and mixing
    text ('a') and binary ('rb') modes only work on py2 str. ``title``
    and ``saveImg`` are module-level names defined elsewhere.
    """
    file_name = './word/'+title+'.docx'
    # append the comments file onto the main content file
    with open('./txt/mainContent.txt','a') as f1:
        with open('./txt/comment.txt','rb') as f2:
            f1.write(f2.read())
    with open('./txt/mainContent.txt','rb') as article_complete:
        document = Document()
        for line in article_complete:
            # skip separator/heading lines; manual line breaks can't be handled yet
            if len(line) == 7 or len(line) == 13 or line == '\n':continue
            # pull image URLs out of the line, then strip them from the text
            img_url = re.findall(r'(http:.*?\.jpg|http:.*?\.png|http:.*?\.gif)',line,re.S)
            line = re.sub(r'(http:.*?\.jpg|http:.*?\.png|http:.*?\.gif)','',line)
            for img in img_url:
                imgName = saveImg(img)
                # saveImg presumably returns 0 on download failure -- TODO confirm
                if imgName != 0:
                    try:
                        document.add_picture('./img/'+imgName, width=Inches(3.25))
                    except Exception:
                        # embedding failed: note it and keep the raw URL
                        document.add_paragraph(u'图片加载失败....')
                        document.add_paragraph(img)
                else:
                    document.add_paragraph(u'图片加载失败....')
                    document.add_paragraph(img)
            document.add_paragraph(line.decode('UTF-8'))
        document.save(file_name.decode('UTF-8'))
class DocXRenderer(mistune.Renderer):
    """Mistune renderer that writes markdown elements into a python-docx
    Document held on ``self.doc``.

    The renderer callbacks return empty strings (or a newline-terminated
    item for ``list_item``) because output accumulates in the document,
    not in the rendered string.
    """

    def __init__(self, *largs, **kargs):
        self.doc = Document()
        self.clist = None
        super(DocXRenderer, self).__init__(*largs, **kargs)

    def list_item(self, text):
        """Return the item text terminated by a newline for ``list``."""
        return "{}\n".format(text)

    def list(self, body, ordered=True):
        """Emit each collected list item as a styled paragraph."""
        style = "ListNumber3" if ordered else "ListBullet"
        for entry in body.rstrip().split("\n"):
            self.doc.add_paragraph(entry, style)
        return ''

    def header(self, text, level, raw=None):
        """Emit a heading at the markdown heading level."""
        self.doc.add_heading(text, level)
        return ''

    def paragraph(self, text):
        """Emit a plain paragraph."""
        self.doc.add_paragraph(text, style=None)
        return ''
def convert_pdf(path='provide path here', format='text', codec='utf-8'):
    """Extract bullet points ('•' items) from a PDF and save them, as
    pretty-printed JSON, inside ``json_data.docx`` (which is then opened).

    Args:
        path: Path to the PDF file.
        format: Only 'text' is supported.
        codec: Output codec passed to pdfminer's TextConverter.

    Returns:
        '' (the document is the real output).

    Raises:
        ValueError: If ``format`` is not 'text'.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    laparams = LAParams()
    if format == 'text':
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    else:
        raise ValueError('Please provide the format to extract')
    maxpages = 500  # mention the maximum pages here (large counts degrade performance)
    caching = True
    page_numbers = set()
    # BUG FIX: the original leaked the file handle if extraction raised;
    # a context manager guarantees it is closed.
    with open(path, 'rb') as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, page_numbers, maxpages=maxpages, caching=caching, check_extractable=True):
            interpreter.process_page(page)
        text = retstr.getvalue().decode()
    device.close()
    retstr.close()
    bulletins_data = re.findall('•([^•]+)*', str(text))
    # single dumps with indent/sort replaces the original dumps->loads->dumps
    # round trip; for a plain dict of strings the output is identical
    json_dict = {'bulletins': list(bulletins_data)}
    final_data = json.dumps(json_dict, indent=4, sort_keys=True)
    document = Document()  # creates a new document
    document.add_heading('Bulletins data in the PDF')
    document.add_paragraph(str(final_data))
    document.save('json_data.docx')  # saves it to the filesystem
    os.startfile("json_data.docx")  # Windows-only: open the result
    return ''
class XQHTMLParser(HTMLParser):
    """HTML parser that converts fetched article HTML into a Word document.

    Text is buffered in ``self.text`` and flushed as a paragraph whenever a
    heading starts or a tag ends; images are downloaded to temp files,
    embedded, and deleted after the document is saved.
    """

    def __init__(self, docfile):
        HTMLParser.__init__(self)
        self.docfile = docfile          # path of the .docx file to open and later save
        self.doc = Document(docfile)
        self.myclient = HTMLClient()    # HTTP helper used to download images
        self.text = ''                  # buffered character data awaiting flush
        self.title = False              # True while handling an <h*> tag's data
        self.isdescription = False      # True inside <script>: its data is ignored
        self.picList=[]                 # temp image files to delete after save

    def handle_starttag(self, tag, attrs):
        """Handle headings (flag), images (download + embed) and scripts (mute)."""
        #print "Encountered the beginning of a %s tag" % tag
        self.title = False
        self.isdescription = False
        if re.match(r'h(\d)', tag):
            self.title = True
        if tag == "img":
            if len(attrs) == 0:
                pass
            else:
                for (variable, value) in attrs:
                    if variable == "src":
                        # the part before '!' is the full-size image URL;
                        # the part after it presumably encodes a size hint -- TODO confirm
                        picdata = self.myclient.GetPic(value.split('!')[0])
                        if picdata == None:
                            pass
                        else:
                            pictmp = value.split('/')[-1].split('!')[0]
                            picfix = value.split('/')[-1].split('!')[-1]
                            with open(pictmp, 'wb') as pic:
                                pic.write(bytes(picdata))
                                pic.close()
                            #if os.path.getsize(pictmp) < 90000:
                            try:
                                # 'c...' suffix seems to mark wide images -- verify
                                if picfix[0:1] == 'c':
                                    self.doc.add_picture(pictmp, width=Inches(4.5))
                                else:
                                    self.doc.add_picture(pictmp)#, width=Inches(2.25))
                            except docx.image.exceptions.UnexpectedEndOfFileError as e:
                                # truncated download: skip the picture, keep going
                                print(e)
                            self.picList.append(pictmp)
        if tag == 'script':
            self.isdescription = True

    def handle_data(self, data):
        """Flush buffered text before a heading; otherwise accumulate text."""
        if self.title == True:
            if self.text != '':
                self.doc.add_paragraph(self.text)
                self.text = ''
            self.doc.add_heading(data, level=2)
        if self.isdescription == False:
            self.text += data

    def handle_endtag(self, tag):
        """Flush any buffered text as a paragraph at every closing tag."""
        #if tag == 'br' or tag == 'p' or tag == 'div':
        if self.text != '':
            self.doc.add_paragraph(self.text)
            self.text = ''

    def complete(self, html):
        """Feed the HTML, save the document, then delete temp image files."""
        self.feed(html)
        self.doc.save(self.docfile)
        for item in self.picList:
            if os.path.exists(item):
                os.remove(item)
def create_docx(path, key, filename='pics', width=None, fontsize=9): from docx import Document from docx.shared import Inches from docx.shared import Pt files=listdir(path) file_list=[] for i in files: if i.find(key) is not -1: print ' %s is found' %i file_list.append(i) doc = Document() para = doc.add_paragraph() run = para.add_run() font = run.font font.name = 'Calibri' font.size = Pt(fontsize) if width is None: width=5.5/len(file_list) for pic in file_list: short_path=path.split('/')[-3]+'/'+path.split('/')[-2]+'/'+path.split('/')[-1] run.add_text(short_path+pic+': \n') para = doc.add_paragraph() run = para.add_run() for pic in file_list: run.add_text(' \n') run.add_picture(path+pic, width= Inches(width)) doc.save_prjt('%s.docx' % filename) print ' save %s.docs' %filename
def get_brief_name(expression, con):
    """Query Oracle ALL_OBJECTS for names matching ``expression`` and write
    one .docx per object type (view/index/package/trigger/sequence).

    Args:
        expression: Regular expression passed to ``regexp_like``.
        con: Open DB-API connection (e.g. cx_Oracle).
    """
    # Fetch all kinds of object_name from all_objects
    find_object_info = '''select OBJECT_TYPE, OBJECT_NAME, OWNER from ALL_OBJECTS where regexp_like(OBJECT_NAME ,:expression)'''
    cur = con.cursor()
    cur.execute(find_object_info, {'expression': str(expression)})
    object_info = cur.fetchall()
    # One document per object type; replaces five parallel if-chains with
    # a single dispatch table (same documents, same file names).
    docs = {
        'VIEW': (Document(), 'ViewName.docx'),
        'INDEX': (Document(), 'IndexName.docx'),
        'PACKAGE': (Document(), 'PackageName.docx'),
        'TRIGGER': (Document(), 'TriggerName.docx'),
        'SEQUENCE': (Document(), 'SequenceName.docx'),
    }
    for res in object_info:
        entry = docs.get(str(res[0]))
        if entry is not None:
            entry[0].add_paragraph(str(res[1]))
    for doc, fname in docs.values():
        doc.save(fname)
def WriteDoc(self, song_title_list, song_lyric_dict):
    """Write song titles and their lyrics into template.docx, saving the
    result as ``<today>.docx``.

    The first title reuses the template's existing first paragraph so the
    document does not start with a blank line; every later title gets a
    fresh paragraph. ``TITLE_PREFIX``, ``doc_Pt`` and ``today`` are
    module-level names defined elsewhere in the file.
    """
    document = Document("template.docx")
    for index, title in enumerate(song_title_list):
        #p = document.add_paragraph(TITLE_PREFIX + title).bold = True
        p = ""
        if index == 0:
            # reuse the template's first (empty) paragraph for the first title
            p = document.paragraphs[0]
        else:
            p = document.add_paragraph()
        run = p.add_run(TITLE_PREFIX + title)
        font = run.font
        font.bold = True
        #font.name = 'SimSun'
        font.name = 'Microsoft YaHei'
        font.size = doc_Pt(12)
        #p = document.add_paragraph()
        song_lyric_paragraph_list = song_lyric_dict.get(title)
        if song_lyric_paragraph_list == None:
            # no lyrics recorded for this title: emit the title only
            continue
        for song_lyric_paragraph in song_lyric_paragraph_list:
            for song_lyric_ in song_lyric_paragraph:
                p = document.add_paragraph()
                run = p.add_run(song_lyric_)
                font = run.font
                font.size = doc_Pt(12)
            # blank paragraph between stanzas
            p = document.add_paragraph()
    document.save(today + '.docx')
    return
def generate_documents(recipes):
    """Create one .docx per recipe in ``output_dir``.

    Each document holds the recipe title, a bulleted "Zutaten"
    (ingredients) list and the "Zubereitung" (instructions) paragraphs.
    """
    create_output_dir()
    for recipe in recipes:
        doc = Document()
        # title line: 14pt bold underlined
        title_run = doc.add_paragraph().add_run(recipe.title.string)
        title_run.font.size = Pt(14)
        title_run.font.bold = True
        title_run.font.underline = True
        # ingredients section header
        zutaten_run = doc.add_paragraph().add_run("Zutaten")
        zutaten_run.font.size = Pt(13)
        zutaten_run.font.bold = True
        for item in recipe.ingredients:
            bullet = doc.add_paragraph(style="ListBullet")
            bullet.add_run(item).font.size = Pt(13)
        # instructions section header
        prep_run = doc.add_paragraph().add_run("Zubereitung")
        prep_run.font.size = Pt(13)
        prep_run.font.bold = True
        for passage in recipe.texts:
            doc.add_paragraph().add_run(passage).font.size = Pt(13)
        doc.save(os.path.join(output_dir, recipe.title.string + ".docx"))
def _save_results(filename, abstracts):
    """Write (title, content) pairs to ``./result/<filename>``.

    Each abstract gets a level-0 heading, one paragraph per CRLF-separated
    line of its content, and a page break.
    """
    doc = Document()
    for title, content in abstracts:
        doc.add_heading(title, level=0)
        for chunk in content.split("\r\n"):
            doc.add_paragraph(chunk)
        doc.add_page_break()
    doc.save("./result/" + filename)
def deployaword (self, texto, nombre):
    """Write each element of ``texto`` as a bullet item into ``<nombre>.docx``."""
    doc = Document()
    for elemento in texto:
        doc.add_paragraph(str(elemento), style='ListBullet')
    doc.save(nombre + '.docx')
def redact(redacters, doc):
    """Return a new Document whose paragraphs mirror ``doc`` with every
    pattern in ``redacters`` replaced by '[REDACTED]'.

    Args:
        redacters: Iterable of regex patterns to blank out.
        doc: Source python-docx Document (not modified).

    Returns:
        A new Document with redacted paragraph text.
    """
    redacted = Document()
    for paragraph in doc.paragraphs:
        text = paragraph.text
        for pattern in redacters:
            # BUG FIX: the original lower-cased the whole paragraph before
            # matching, destroying the casing of the text that is kept.
            # Case-insensitive substitution matches the same (lowercase)
            # patterns while preserving the surviving text verbatim.
            text = re.sub(pattern, '[REDACTED]', text, flags=re.IGNORECASE)
        redacted.add_paragraph(text)
    return redacted
def writeDoc(title,content):
    """Save ``content`` under a level-0 heading ``title`` as ``<title>.docx``."""
    target = title + '.docx'
    doc = Document()
    doc.add_heading(title, 0)
    doc.add_paragraph(content)
    doc.save(target)
def get(name):
    """Write a small GBK-decoded demo document to ``<cwd>/<name>``.

    NOTE(review): Python 2 code -- ``str.decode('gbk')`` does not exist on
    py3 str.
    """
    saveFile = os.getcwd()+"/"+name
    try:
        doc = Document()
        doc.add_heading("HEAD".decode('gbk'),0)
        doc.add_paragraph("HEAD CONTENT".decode('gbk'))
        doc.save(saveFile)
    # BUG FIX: bare ``except:`` swallowed everything (including
    # KeyboardInterrupt) and hid the cause; narrow it and report the error.
    except Exception as e:
        print("error: %s" % e)
def writeresult(self, vm_mtu):
    """Drive one server/client benchmark pair and record the results.

    Spawns a server and a client HostProcess, runs every command in
    ``CONF.cmd_list`` ``CONF.test_times`` times, writes raw output into a
    Word document and the extracted metrics into an Excel workbook, and
    synchronizes with sibling pairs through ``self.all_cmd_q``.

    NOTE(review): the queue handshakes below are order-sensitive -- do not
    reorder statements.
    """
    ori_path, extract_path = self.create_file(vm_mtu)
    ori_docx = Document()        # raw command output
    extract_xlsx = Workbook()    # extracted metrics, one sheet per run
    self.create_excel_sheet(extract_xlsx)
    column = 1                   # next free metrics column in the sheet
    self.write_ori_head(ori_docx)
    s = HostProcess(CONF[self.pair], vm_mtu, self.output_q, 'server', self.s_cmd_q, self.s_result, self.s_check_q)
    c = HostProcess(CONF[self.pair], vm_mtu, self.output_q, 'client', self.c_cmd_q, self.c_result)
    s.start()
    c.start()
    run_cmd_times = 0
    for item in CONF.cmd_list:
        ori_docx.add_paragraph(item.upper(), style='Heading 4')
        s_cmd = self.create_cmd('server', item)
        c_cmd = self.create_cmd('client', item)
        for seq in range(CONF.test_times):
            # hand the command to the server and wait until it is consumed
            self.s_cmd_q.put(s_cmd)
            while not self.s_cmd_q.empty():
                time.sleep(0.2)
            # block until the server signals it is ready, then start the client
            self.s_check_q.get()
            time.sleep(2)
            self.c_cmd_q.put(c_cmd)
            try:
                s_result = self.s_result.get(timeout=240)
                c_result = self.c_result.get(timeout=240)
            except:
                # presumably a hung server process; kill it and retry the reads
                self.kill_process(CONF[self.pair], 'server')
                s_result = self.s_result.get(timeout=240)
                c_result = self.c_result.get(timeout=240)
            self.output_q.put('wait all cmd run complete')
            self.write_ori_to_docx(ori_docx, seq, s_cmd, c_cmd, s_result, c_result)
            extracted_result = self.extract_result(c_result)
            # sheet name means "data set #<seq+1>"
            ws = extract_xlsx.get_sheet_by_name(u'第{}组数据'.format(str(seq+1)))
            for index, result_cell in enumerate(extracted_result):
                self.output_q.put(extracted_result)
                self.output_q.put(result_cell)
                ws.cell(row=2, column=column+index, value=result_cell)
                # read-modify-write of the shared result structure through the queue
                all_result = self.collect_all_result.get()
                all_result[seq][self.num][column+index] = result_cell
                self.collect_all_result.put(all_result)
            run_cmd_times += 1
            self.output_q.put(run_cmd_times)
            self.all_cmd_q.put('cmd_run')
            # barrier: wait until every pair has finished this round
            while self.all_cmd_q.qsize() != run_cmd_times * self.pair_num:
                time.sleep(0.3)
            time.sleep(3)
        column += len(extracted_result)
    ori_docx.save(ori_path)
    extract_xlsx.save(extract_path)
    self.output_q.put('save_xlsx')
    # tell both worker processes to shut down, then reap them
    self.s_cmd_q.put('over')
    self.c_cmd_q.put('over')
    s.join()
    c.join()
    self.output_q.put('over')
def writeDoc(docTitle, feed, fileName):
    """Save a UTF-8 feed string as ``<fileName>.docx`` under a hard-coded
    Windows directory, falling back to ``Bad_Name.docx`` when the file
    name is unusable.

    NOTE(review): Python 2 code (``unicode``); ``docTitle`` is unused; the
    output directory is hard-coded to an I: drive path.
    """
    # Setting up word doc for file writing
    document = Document()
    document.add_heading(fileName, 0)
    # Writing it out
    document.add_paragraph(unicode(feed, 'utf-8'))
    try :
        document.save('I:\dev_JC\_Python\Data_Pull\WD\\' + fileName + '.docx')
    except IOError:
        # fileName contained characters the filesystem rejects
        document.save('I:\dev_JC\_Python\Data_Pull\WD\\' + "Bad_Name" + '.docx')
def substituteVariablesDocx(config, file_name_in, fileNameOut, subs):
    """Render a .docx template through the configured template engine while
    preserving per-run formatting.

    Strategy: serialize every run of every paragraph into one marker-encoded
    string (``<text>+<run_index>+run+`` ... ``<para_index>+para+``), render
    that string with ``subs`` as the context, then split it back apart and
    rebuild the document, copying each run's style from a formatting-only
    copy (``doc_temp``) made before rendering.

    Returns:
        dict: ``{"file": fileNameOut}``.
    """
    c = Context(subs)
    # normalize the input path before handing it to python-docx
    doc_in = Document(docx=file_name_in.replace("/./", "/").replace("\\.\\", "\\").replace("\\", "/"))
    doc_temp = Document()   # style-preserving copy of the original paragraphs
    paras=doc_in.paragraphs
    fullText=""
    i = 0
    styles = {}
    for para in paras:
        paraText=""
        p = doc_temp.add_paragraph(style = para.style)
        docx_copy_para_format_from(p, para)
        j = 0
        runs = para.runs
        for run in runs:
            txt = run.text
            # encode run text + its index so the style can be found again later
            paraText+= txt+"+"+str(j)+"+run+"
            r = p.add_run(text = txt, style=run.style)
            docx_copy_run_style_from(r, run)
            j+=1
        fullText+= paraText+str(i)+"+para+"
        i+=1
    fullText = preprocess(fullText)
    t = get_engine(config).from_string(fullText)
    xtxt = t.render(c)
    xtxt = apply_sequence(xtxt)
    xParaTxts = xtxt.split("+para+")
    # wipe the original body; it is rebuilt from the rendered text below
    for p in paras:
        removePara(p)
    doc_in.paragraphs.clear()
    paras=doc_temp.paragraphs
    for xParaTxt in xParaTxts:
        runTxts = xParaTxt.split("+run+")
        # the trailing piece is the original paragraph index (empty on the
        # final split remainder, which is skipped)
        if runTxts[-1]!='':
            para_n = int(runTxts[-1])
            p = doc_in.add_paragraph(style=paras[para_n].style)
            docx_copy_para_format_from(p, paras[para_n])
            for runTxt in runTxts[:-1]:
                try:
                    txt = runTxt.split("+")[-2]
                except:
                    # rendered run collapsed to nothing
                    txt=""
                run_n = int(runTxt.split("+")[-1])
                r = p.add_run(text=txt, style=paras[para_n].runs[run_n].style)
                docx_copy_run_style_from(r, paras[para_n].runs[run_n])
            # control lines are tagged "{}" so they can be removed below
            if isControlLine(paras[para_n].text):
                p.text="{}"
    for p in doc_in.paragraphs:
        if p.text=="{}":
            removePara(p)
    doc_in.save(fileNameOut)
    return {"file":fileNameOut}
class HoroDocument():
    """Builds a Word document from a list of sections, each a dict with a
    'section_title' and a 'paragraph' list of {'title', 'content'} dicts."""

    def generate(self, content):
        """Create a fresh Document on ``self.doc`` and fill it from ``content``."""
        self.doc = Document()
        for section in content:
            self.doc.add_heading(section['section_title'], level=1)
            for entry in section['paragraph']:
                self.doc.add_heading(entry['title'], level=3)
                self.doc.add_paragraph(entry['content'])

    def save(self, filename):
        """Persist the generated document to ``filename``."""
        self.doc.save(filename)
def make_entry(res):
    """Build (or append to) the notebook .docx with the announcements for
    the date captured in the triggering message's regex match.

    Announcements are grouped by user (sorted), each rendered as a name
    header followed by bulleted items. When no announcements exist for the
    date, the requesting user is told so instead.
    """
    # pull out the target notebook date from the regex
    date = dateutil.parser.parse(res.match.group('date'))
    logging.debug("new notebook target date: %s", date)
    # query the database for announcements from the specified date
    # TODO: build a new brain api and refactor?
    announcements = res.robot.brain.db.search((where('plugin') == 'notebook') & (where('type') == 'announcement') & (where('date') == date.isoformat()) )
    logging.debug("announcements are %s", announcements)
    # if there actually _are_ announcements for that date
    if announcements != []:
        document = None
        if res.robot.config['plugin_config']['notebook']['append']:
            # keep extending the configured notebook file
            document = Document(res.robot.config['plugin_config']['notebook']['file'])
        else:
            # create a new docx document object
            document = Document()
        # fill in a bunch of boilerplate
        document.add_page_break()
        document.add_heading('{date}, the BEC'.format(date=date.strftime('%m/%d/%Y')), level=1)
        document.add_heading('Announcements:', level=2)
        # pull out a list of all the users that had announced
        users = set(map(lambda a: a['user'], announcements))
        logging.debug("users are %s", users)
        # for each user who has announced
        for user in sorted(users):
            # get their real name from the Slack roster
            real_name = res.robot.slack_client.server.users.find(user).real_name
            logging.debug("announcing user %s is %s", user, real_name)
            # create a new heading for them
            document.add_paragraph("{}:".format(real_name))
            # for every one of their posts
            for announcement in announcements:
                if announcement['user'] == user:
                    # create a bullet-point for that announcement
                    document.add_paragraph("{}".format(announcement['announcement']), style='ListBullet')
        # TODO: onedrive
        document.save(res.robot.config['plugin_config']['notebook']['file'])
    else:
        # let the user know that that meeting date doesn't exist
        res.reply(res.msg.user, "No announcements for date {}".format(date.strftime('%m/%d/%Y')))
def doit():
    """Dump the schema of every table in the 'renrenbx' MySQL database into
    a timestamped Word document: one heading per table, engine + table
    comment paragraphs, and a 4-column table of the columns.

    NOTE(review): connection host/credentials are hard-coded (placeholders
    here) -- they should come from configuration or the environment.
    """
    host = '192.168.1.254'
    user = '******'
    password = '******'
    database = 'renrenbx'
    mysql_init(mysql_host = host, mysql_database = database, mysql_user = user, mysql_password = password)
    # index schema rows by table name, attaching each column row
    tables = {}
    for column in table_schema(database):
        tables[column['TABLE_NAME']] = {'info':column,'columns':[]}
    for column in table_colume(database):
        tables[column['TABLE_NAME']]['columns'] += [column]
    document = Document()
    document.add_heading(database, 0)
    i = 0
    max = len(tables)   # NOTE(review): shadows the builtin ``max``
    for key in sorted(tables.keys()):
        i = i + 1
        # crude console progress bar
        value = int(round((i * 1.0) / max * 100))
        sys.stdout.write(' [' + '#' * i + '] %s%%' % value + '\r')
        sys.stdout.flush()
        document.add_heading(key, 1)
        table_engine = tables[key]['info']['ENGINE']
        paragraph = document.add_paragraph()
        paragraph.add_run(table_engine).bold = True
        # table comment, or the "no comment" placeholder
        table_comment = tables[key]['info']['TABLE_COMMENT']
        paragraph = document.add_paragraph()
        paragraph.add_run(table_comment if table_comment else u'无注释').bold = True
        # column table: field / primary key / type / comment
        table = document.add_table(rows = 1, cols = 4)
        hdr_cells = table.rows[0].cells
        hdr_cells[0].text = u'字段'
        hdr_cells[1].text = u'主键'
        hdr_cells[2].text = u'类型'
        hdr_cells[3].text = u'注释'
        for column in tables[key]['columns']:
            row_cell = table.add_row().cells
            row_cell[0].text = column['COLUMN_NAME']
            row_cell[1].text = column['COLUMN_KEY'] if column['COLUMN_KEY'] else '-'
            row_cell[2].text = column['COLUMN_TYPE']
            row_cell[3].text = column['COLUMN_COMMENT'] if column['COLUMN_COMMENT'] else '-'
    document.save('%s-%s.docx' % (database,datetime.datetime.now().strftime("%Y%m%d%H")))
def finish_report(self, arg, text, extra): easygui.msgbox("Reached", title="Warning") remove = set(self.lst) sorted_list = list(remove) document = Document() headings = ['All Files', 'File System Info', 'Deleted Files', 'Deleted Folders', 'Partition Info'] for entry in sorted_list: print entry document.add_heading(headings[entry]) document.add_paragraph(self.storage_array[entry + 1]) document.save('test.docx')
def generatedocx(logs, quarter):
    """Generate the quarterly .docx issues report for the given PSA logs.

    This function will generate the .docx file for the report, given the
    list of logs in the form of the named tuple (fields ``date``, ``time``,
    ``psa``). The file is written to ``reports/<year>/Q<quarter>_Issues.docx``.

    Note: ``logs`` is sorted in place by its third field (time).

    Removed in review: a ``testlogs`` nested dict was built but never used
    (its only consumer was commented out) -- dead code deleted.
    """
    report = Document()
    # chronological order by the time field
    logs.sort(key=lambda tup: tup[2])
    # Generate the title; in January the report covers Q4 of the previous year
    if date.today().month == 1:
        report.add_heading('KTEQ-FM Quarterly Issues Report Q' + str(quarter) + ' '+ str(date.today().year-1), level=0)
    else:
        report.add_heading('KTEQ-FM Quarterly Issues Report Q' + str(quarter) + ' '+ str(date.today().year), level=0)
    report.add_paragraph('This document is the quarterly Community Issues Report for KTEQ-FM. It details a number of community issues discussed during programming throughout the quarter, and lists public service announcements that support these issues. This list contains all of the public service announcements played on air by live DJs. For a complete list, including automated public service announcements, contact KTEQ-FM management at [email protected]')
    table = report.add_table(rows=1, cols=3)
    hdr = table.rows[0].cells
    hdr[0].text = 'Date Played'
    hdr[1].text = 'Time Played'
    hdr[2].text = 'PSA Title'
    for entry in logs:
        row = table.add_row().cells
        row[0].text = str(entry.date)
        row[1].text = str(entry.time)
        row[2].text = entry.psa
    reportdir = 'reports/' + str(date.today().year)
    if not os.path.exists(reportdir):
        os.makedirs(reportdir)
    outputfile = reportdir + '/Q' + str(quarter) + '_Issues.docx'
    report.save(outputfile)
def OnButton4Button(self, event):
    """wx button handler: capture an instrument screenshot and emit a
    timestamped Word test report (product name, model, screenshot, tester).

    The demo build self-destructs after 10 reports (``vs.tm`` counter).
    """
    path = os.getcwd()
    # ask the analyzer (via SCPI) to store its screen next to the script
    vs.tr1300.SCPI.MMEMory.STORe.IMAGe = path + '\\temp.png'
    document = Document()
    document.add_heading('TEST REPORT',0)
    # u'品名' = product name, taken from the first text field
    document.add_heading( u'品名' , level=1)
    document.add_paragraph(self.textCtrl1.Value, style='IntenseQuote')
    # u'型號' = model number, taken from the second text field
    document.add_heading(u'型號', level=1)
    document.add_paragraph( self.textCtrl2.Value, style='IntenseQuote')
    document.add_picture(path + '\\temp.png', width=Inches(6))
    a = time.localtime()
    s = '%04d/%02d/%02d %02d:%02d:%02d' % (a[0],a[1],a[2],a[3],a[4],a[5])
    document.add_paragraph(s)
    # output file named after the same timestamp
    t = path + '\\%04d%02d%02d_%02d%02d%02d.docx' % (a[0],a[1],a[2],a[3],a[4],a[5])
    document.add_paragraph('Test by ' + self.choice1.GetString(self.choice1.GetSelection()))
    document.save(t)
    # demo limitation: allow at most 10 runs, then close the window
    vs.tm += 1
    if vs.tm == 10:
        wx.MessageBox('The Demo program only can run 10 times.','Up to limit!', style = wx.CENTER | wx.ICON_WARNING | wx.OK)
        self.Close()
        self.Destroy()
    event.Skip()
def write_test_result_report_word(result_report):
    """Append one test result (heading, message, screenshot) to the shared
    Word results file, creating the file on first use."""
    image_path = sys_tools.base_path + '\\auto_results\\screenshots\\' + result_report.screen_shot
    word_result_path = sys_tools.base_path + '\\auto_results\\test_results\\' + settings.test_result_file_name_word
    # reopen the accumulated report if it exists, otherwise start fresh
    if os.path.exists(word_result_path):
        document = Document(word_result_path)
    else:
        document = Document()
    heading_text = result_report.report_name + ': ' + result_report.report_test_result.__str__() + '-' + result_report.browser.__str__()
    document.add_heading(heading_text, level=1)
    document.add_paragraph(text='Test Result: ' + result_report.msg)
    document.add_picture(image_path, width=Inches(4.5))
    document.save(word_result_path)
def exportPayInfo(self):
    """Export payment info for pending reservations and mail it.

    Builds a Word document with one section per unconfirmed ticket whose
    confirmation deadline has not passed (route, flight time, headcount,
    amount, deadline, WeChat payment QR code) and sends the file as a mail
    attachment to the configured recipients.
    """
    title = '%s[%s]' % (self.username, datetime.strftime(datetime.now(), "%Y-%m-%d"))
    reserveInfo = pitcher.getReserveInfo()
    # keep rows that are unconfirmed (状态 == 未确认) AND whose last
    # confirmation time (最后确认时间) is still in the future
    c1 = reserveInfo[u'状态'] == u'未确认'
    c2 = reserveInfo[u'最后确认时间'].apply(parser.parse) > datetime.now()
    reserveInfo = reserveInfo[c1 & c2]
    document = Document()
    document.add_heading(title)
    for i, row in reserveInfo.iterrows():
        # u'票项%d' = "ticket item #n"
        document.add_heading(u'票项%d' % (i + 1), level=1)
        document.add_paragraph(text=u'航线: ' + row[u'航线'])
        document.add_paragraph(text=u'航班时间: ' + row[u'航班时间'])
        document.add_paragraph(text=u'人数: ' + row[u'人数'])
        document.add_paragraph(text=u'金额: ' + row[u'金额'])
        document.add_paragraph(text=u'最后确认时间: ' + row[u'最后确认时间'])
        # fetch the WeChat payment QR code into a temp jpg and embed it
        filename = tempfile.mktemp(suffix='.jpg',prefix='tmp_')
        with open(filename, 'wb') as f:
            orderNumber = pitcher.getOrderNumber(row[u'预订ID'])
            qrcode = pitcher.getWeixinPayQrcode(orderNumber)
            f.write(qrcode)
        document.add_picture(filename, width=Inches(1))
        # throttle the remote calls between tickets
        time.sleep(self.normalWaitingSecond)
    filename = tempfile.mktemp(suffix='.docx',prefix=title + '_')
    document.save(filename)
    # send the email with the document attached (body: u'见附件' = "see attachment")
    send_mail(settings.MAIL_LIST, title, u'见附件', [filename])
def _write_docx(self, path, content):
    """Write ``content`` as a single-paragraph .docx file.

    Args:
        path (str): Destination file path.
        content (str): Text for the document body.
    """
    document = Document()
    document.add_paragraph(content)
    # stamp the creation time into the document's core properties
    document.core_properties.created = datetime.now()
    document.save(path)
def transclusionDOCX(templatefilepath, resultDocumentDirectoryPath, dataEntities):
    """Render the template once per data entity into numbered .docx files.

    Each line of the template is expanded with ``line.format(**dataEntity)``
    and written as a paragraph below a heading holding the entity's "id".
    Output files are named ``<uuid>_<n>.docx`` in the target directory.

    Args:
        templatefilepath: Path to the line-oriented template file.
        resultDocumentDirectoryPath: Directory for the generated documents.
        dataEntities: Iterable of dicts; each must contain an "id" key plus
            whatever fields the template references.

    Returns:
        bool: False if the template file does not exist, True otherwise.
    """
    if not os.path.isfile(templatefilepath):
        return False
    transclusion_id = uuid.uuid1()
    document_number = 1
    for dataEntity in dataEntities:
        # BUG FIX: the original hard-coded '\\' as the path separator;
        # os.path.join keeps this portable.
        resultDocumentPath = os.path.join(resultDocumentDirectoryPath, "{0}_{1}.docx".format(transclusion_id, document_number))
        document = Document()
        document.add_heading(dataEntity["id"], 0)
        with open(templatefilepath, "r") as template_file:
            for line in template_file:
                document.add_paragraph(line.format(**dataEntity))
        document.save(resultDocumentPath)
        document_number += 1
    return True
def create_word_document():
    """Creates a word document containing job data.

    Reads job records from the local db helper and writes one page per job
    (title, employer, labeled detail lines, optional comments, summary)
    into ``jobs_EZSearch.docx``.

    NOTE(review): jobs are indexable records; the assumed layout is
    0=id, 1=title, 2=employer, 4=location, 5=openings, 6=level,
    7=discipline, 10=comments, 11=summary -- TODO confirm against
    ``db.get_csv_json_text_data``.
    """
    document = Document()
    # default font for the whole document
    style=document.styles['Normal']
    style.font.name = 'Calibri'
    style.font.size = Pt(12)
    def add_line(label, value):
        """Adds the paragraph with format of 'label: value[bolded]'"""
        para = document.add_paragraph()
        para_label = para.add_run(label + ": ")
        para_value = para.add_run(value)
        para_value.bold = True
    jobs = db.get_csv_json_text_data()
    for job in jobs:
        title = document.add_paragraph(job[1], style='Title')
        title.alignment = 1
        employer_name = document.add_heading(job[2], level=1)
        employer_name.alignment = 1
        add_line('Location', job[4])
        add_line('Level', job[6])
        add_line('Number of Openings', job[5])
        add_line('Discipline', job[7])
        add_line('Job Id', job[0])
        if (len(job[10]) > 10): #show Comments header only if the job contains comments
            document.add_heading('Comments', level=2)
            document.add_paragraph(job[10])
        # document.add_paragraph('\n\n\n\n\n\n\n')
        document.add_heading('Summary', level=2)
        #every time a <br /> occurs, add a linebreak to the current document
        print('The type of job[11] is: ' + str(type(job[11])))
        # also undo simple HTML entities left in the summary text
        summary = job[11].replace('<br />', '\n').replace('&#39;', "'").replace('&nbsp;', ' ')
        document.add_paragraph(summary)
        document.add_page_break()
    document.save('jobs_EZSearch.docx')
# Parse the landing page and collect article links + thumbnail URLs from
# the main swiper carousel.
# NOTE(review): ``source``, ``document`` and ``name`` are assumed to be
# defined earlier in the script -- they are not created in this section.
soup = BeautifulSoup(source, 'lxml')
mainSwiper = soup.find('div',class_='swiper-wrapper clearfix')
swiper = mainSwiper.find_all('div', class_='swiper-slide')
i=0
List = []          # article hrefs
Thumbnails = []    # matching thumbnail image srcs
for news in swiper:
    List.append(news.a.get('href'))
    Thumbnails.append(news.img.get('src'))
    i = i + 1
# Follow each article link and copy its paragraphs into the Word document.
for links in List:
    counter = 0
    inner_source = requests.get(links).text
    inner_soup = BeautifulSoup(inner_source,'lxml')
    article = inner_soup.find('div', class_='content-element')
    if article != None:
        paragraphs = article.find_all('p')
        print(paragraphs)
        for p in paragraphs:
            if(counter!=0):
                document.add_paragraph(p.text)
            else:
                # the first paragraph of each article is styled as a quote
                document.add_paragraph(p, style='IntenseQuote')
            counter = counter+1
document.save(f'{name}.docx')
# NOTE(review): this section appears to belong inside a loop over
# ``clean_up`` -- ``txt``, ``clean_up`` and ``i`` are defined earlier in
# the script, outside this view. The replacements repair common OCR
# artefacts: '.' misread for '@', ']'/'[' misread for 'l' or 'I' around
# the tokens listed in ``clean_up``.
txt = txt.replace('.' + clean_up[i], '@' + clean_up[i])
txt = txt.replace(clean_up[i] + ']', clean_up[i] + 'l')
txt = txt.replace('a]', 'al')
txt = txt.replace('a[', 'al')
txt = txt.replace(clean_up[i] + '[', clean_up[i] + 'l')
txt = txt.replace(' ] ', ' I ')
txt = txt.replace(' [ ', ' I ')
# paragraph bookkeeping: blank-line-separated chunks
numParagraphs = txt.count('\n\n') + 1
txt_list = txt.split('\n\n')
#final_text.append(txt
# dump the cleaned OCR text to a scratch document
ocr_raw = Document()
ocr_raw.add_paragraph('')
ocr_raw.paragraphs[0].add_run( txt) # Each txt is generated per a page from the PDF
ocr_raw.save('OCR-raw.docx')
# ask the user how the text should be sectioned
user_input = input( "Type 'p' for PARAGRAPHS or 'c' for CHAPTERS (include quotes): ")
document = Document('OCR-raw.docx')
user_input = str(user_input)
if user_input == 'p':
    section_string = "PARAGRAPH "
else:
    section_string = "CHAPTER "
new_document = Document('PB-Template.docx')
#numParagraphs = len(document.paragraphs)
numTemplateParagraphs = len(new_document.paragraphs)
def export_doc_single(request, *args, **kwargs):
    """Export a single QAPP object as a Word (.docx) HTTP attachment.

    Looks up the QAPP identified by the 'pk' kwarg for the requesting user,
    then builds: cover page, approval-signature table, table of contents,
    revision history, sections A-D and references, and streams the document
    back as an 'attachment' HttpResponse.
    """
    qapp_id = kwargs.get('pk', None)
    qapp_info = get_qapp_info(request.user, qapp_id)
    if not qapp_info:
        # NOTE(review): HttpResponseRedirect expects a URL string, not the
        # request object — confirm the intended redirect target.
        return HttpResponseRedirect(request)

    filename = '%s.docx' % slugify(qapp_info['qapp'].title)
    document = Document()
    add_custom_headers(document)
    styles = document.styles

    # #################################################
    # BEGIN COVER PAGE
    # #################################################
    # Coversheet with signatures section:
    # TODO: Add top row logo and blue background label
    run = document.add_paragraph().add_run()
    try:
        if DEBUG:
            logo = path.join(STATIC_ROOT, 'EPA_Files', 'loogo.png')
            qual_assur_proj_plan = path.join(
                STATIC_ROOT, 'images', 'quality_assurance_project_plan.PNG')
        else:
            logo = static('logo.png')
            qual_assur_proj_plan = static('quality_assurance_project_plan.PNG')
        run.add_picture(logo, width=Inches(1.5))
        run.add_text('\t\t\t')
        run.add_picture(qual_assur_proj_plan, width=Inches(3))
    except FileNotFoundError:
        print('couldn\'t find the static images!')

    # TODO Make blue_header text white, add blue background with shadow
    # background color: rgb(0, 176, 240)
    # The rest of the document will be WD_ALIGN_PARAGRAPH.CENTER
    add_center_heading(document, 'Office of Research and Development', 1)
    add_center_heading(document, qapp_info['qapp'].division.name, 1)
    add_center_heading(document, qapp_info['qapp'].division_branch, level=3)
    add_center_heading(document, 'EPA Project Lead', level=2)
    for lead in qapp_info['qapp_leads']:
        add_center_heading(document, lead.name, level=3)
    add_center_heading(document, qapp_info['qapp'].intra_extra, level=3)
    add_center_heading(document, qapp_info['qapp'].qa_category, level=3)
    add_center_heading(document, qapp_info['qapp'].revision_number, level=3)
    add_center_heading(document, str(qapp_info['qapp'].date), level=3)
    add_center_heading(document, 'Prepared By', level=2)
    add_center_heading(document, '%s %s' % (
        qapp_info['qapp'].prepared_by.first_name,
        qapp_info['qapp'].prepared_by.last_name,), level=3)
    add_center_heading(document, qapp_info['qapp'].strap, level=3)
    add_center_heading(document, qapp_info['qapp'].tracking_id, level=3)

    # #################################################
    # END COVER PAGE -- BEGIN APPROVAL PAGE
    # #################################################
    add_custom_heading(document, 'A.1 Approval Page', level=2)

    # Signature grid: 12 columns merged into label/value pairs; one row per
    # signature plus fixed header and hand-written-signature rows.
    num_signatures = len(qapp_info['signatures'])
    table = document.add_table(rows=6 + num_signatures, cols=12)
    table.style = styles['Table Grid']
    set_table_row_height(table)

    row_cells = table.rows[0].cells
    row_cells[0].text = 'QA Project Plan Title:'
    row_cells[0].merge(row_cells[3])
    row_cells[4].text = qapp_info['qapp_approval'].project_plan_title
    row_cells[4].merge(row_cells[11])

    row_cells = table.rows[1].cells
    row_cells[0].text = 'QA Activity Number:'
    row_cells[0].merge(row_cells[3])
    row_cells[4].text = qapp_info['qapp_approval'].activity_number
    row_cells[4].merge(row_cells[11])

    # TODO: Center text in this row:
    row_cells = table.rows[2].cells
    row_cells[0].text = 'If Intramural or Extramural, EPA Project Approvals'
    row_cells[0].merge(row_cells[11])

    iter_count = 0
    # EPA (non-contractor) project approvals start at row 3.
    for sig in qapp_info['signatures']:
        if not sig.contractor:
            row_cells = table.rows[3 + iter_count].cells
            row_cells[0].text = 'Name:'
            row_cells[1].text = sig.name
            row_cells[1].merge(row_cells[3])
            row_cells[4].text = 'Signature/Date:'
            row_cells[4].merge(row_cells[5])
            row_cells[6].merge(row_cells[11])
            iter_count += 1

    # Always insert a blank entry for hand-written approval sigs
    row_cells = table.rows[3 + iter_count].cells
    row_cells[0].text = 'Name:'
    row_cells[1].merge(row_cells[3])
    row_cells[4].text = 'Signature/Date:'
    row_cells[4].merge(row_cells[5])
    row_cells[6].merge(row_cells[11])

    # TODO: Center text in this row:
    row_cells = table.rows[4 + iter_count].cells
    row_cells[0].text = 'If Extramural, Contractor Project Approvals'
    row_cells[0].merge(row_cells[11])

    # Contractor project approvals start at row 5 + iter_count.
    for sig in qapp_info['signatures']:
        if sig.contractor:
            row_cells = table.rows[5 + iter_count].cells
            row_cells[0].text = 'Name:'
            row_cells[1].text = sig.name
            row_cells[1].merge(row_cells[3])
            row_cells[4].text = 'Signature/Date:'
            row_cells[4].merge(row_cells[5])
            row_cells[6].merge(row_cells[11])
            iter_count += 1

    # Always insert a blank entry for hand-written approval sigs
    row_cells = table.rows[5 + iter_count].cells
    row_cells[0].text = 'Name:'
    row_cells[1].merge(row_cells[3])
    row_cells[4].text = 'Signature/Date:'
    row_cells[4].merge(row_cells[5])
    row_cells[6].merge(row_cells[11])

    document.add_page_break()

    # #################################################
    # END APPROVAL PAGE -- BEGIN ToC PAGE
    # #################################################
    create_toc(document)
    document.add_page_break()

    # #################################################
    # END ToC PAGE -- BEGIN Everything Else
    # #################################################
    # 1) Heading 1 - Revision History
    document.add_heading('Revision History', level=1)
    # 2) Table Label
    document.add_heading('Table 1 QAPP Revision History', level=3)
    # 3) Table (revision history)
    num_revisions = len(qapp_info['revisions'])
    # Fix: the revision table needs one row per revision — the original
    # sized it with num_signatures.
    table = document.add_table(rows=1 + num_revisions, cols=3)
    table.style = styles['Light List']
    row_cells = table.rows[0].cells
    row_cells[0].text = 'Revision Number'
    row_cells[1].text = 'Date Approved'
    row_cells[2].text = 'Revision'
    iter_count = 0
    for rev in qapp_info['revisions']:
        row_cells = table.rows[1 + iter_count].cells
        row_cells[0].text = rev.revision
        row_cells[1].text = str(rev.effective_date)
        row_cells[2].text = rev.description
        iter_count += 1

    # TODO: Paragraphs aren't formatting properly, still double spaces...
    # Section A
    document.add_heading('Section A - Executive Summary', level=1)
    if qapp_info['section_a']:
        document.add_heading('A.3 Distribution List', level=2)
        document.add_paragraph(qapp_info['section_a'].a3, styles['No Spacing'])
        document.add_heading('A.4 Project Task Organization', level=2)
        document.add_paragraph(qapp_info['section_a'].a4, styles['No Spacing'])
        document.add_heading('A.5 Problem Definition Background', level=2)
        document.add_paragraph(qapp_info['section_a'].a5, styles['No Spacing'])
        document.add_heading('A.6 Project Description', level=2)
        document.add_paragraph(qapp_info['section_a'].a6, styles['No Spacing'])
        document.add_heading('A.7 Quality Objectives and Criteria', level=2)
        document.add_paragraph(qapp_info['section_a'].a7, styles['No Spacing'])
        document.add_heading('A.8 Special Training Certification', level=2)
        document.add_paragraph(qapp_info['section_a'].a8, styles['No Spacing'])
        document.add_heading('A.9 Documents and Records', level=2)
        document.add_paragraph(qapp_info['section_a'].a9, styles['No Spacing'])
    else:
        document.add_heading('SECTION A INCOMPLETE!', level=2)

    # Section B — field list depends on the QAPP's section-B type.
    document.add_heading('Section B', level=1)
    if qapp_info['section_b']:
        sectionb_type = qapp_info['section_a'].sectionb_type.name
        section_b_info = SECTION_B_INFO[sectionb_type]
        for key in section_b_info:
            val = getattr(qapp_info['section_b'], key, '')
            if section_b_info[key].get('heading', False):
                document.add_heading(section_b_info[key]['heading'], level=2)
            document.add_heading(section_b_info[key]['label'], level=3)
            document.add_paragraph(val, styles['No Spacing'])
    else:
        document.add_heading('SECTION B INCOMPLETE!', level=2)

    # Section C
    document.add_heading('Section C', level=1)
    if qapp_info['section_c']:
        document.add_heading('C.1 Assessments and Response Actions', level=2)
        document.add_paragraph(qapp_info['section_c'].c1, styles['No Spacing'])
        document.add_heading('C.2 Reports to Management', level=2)
        document.add_paragraph(qapp_info['section_c'].c2, styles['No Spacing'])
        # document.add_heading('C.3 Quality Metrics (QA/QC Checks)', level=2)
        # document.add_paragraph(
        #     qapp_info['section_c'].c3,
        #     styles['No Spacing'])
    else:
        document.add_heading('C.1 Assessments and Response Actions', level=2)
        document.add_heading('C.2 Reports to Management', level=2)

    # Section D
    if qapp_info['section_d']:
        document.add_heading('Section D', level=1)
        document.add_heading(
            'D.1 Data Review, Verification, and Validation', level=2)
        document.add_paragraph(qapp_info['section_d'].d1, styles['No Spacing'])
        document.add_heading(
            'D.2 Verification and Validation Methods', level=2)
        document.add_paragraph(qapp_info['section_d'].d2, styles['No Spacing'])
        document.add_heading(
            'D.3 Reconciliation with User Requirements', level=2)
        document.add_paragraph(qapp_info['section_d'].d3, styles['No Spacing'])
    else:
        document.add_heading('SECTION D INCOMPLETE!', level=2)

    # References
    document.add_heading('References', level=1)
    if qapp_info['references']:
        run = document.add_paragraph().add_run()
        run.add_text(qapp_info['references'].references)
        # document.add_paragraph(
        #     qapp_info['references'].references.replace('\r\n\r\n', '\r\n'),
        #     styles['No Spacing'])
    else:
        document.add_heading('REFERENCES SECTION INCOMPLETE!', level=2)

    content_type = 'application/vnd.openxmlformats-officedocument.' + \
        'wordprocessingml.document'
    # Fix: pass content_type as the keyword argument — the original passed it
    # positionally, which made it the response *body* instead of the header.
    response = HttpResponse(content_type=content_type)
    response['Content-Disposition'] = 'attachment; filename=%s' % filename
    document.save(response)
    response['filename'] = filename
    return response
import requests
from bs4 import BeautifulSoup
import os
import docx
from docx import Document
from docx.shared import Inches

url = 'https://www.qiushibaike.com/article/119757360'
html = requests.get(url).content
soup = BeautifulSoup(html, 'html.parser')

# Article text and the thumbnail image URL (protocol-relative -> https).
wen = soup.find('div', {"class": "content"}).text
img = str(soup.find('div', {"class": "thumb"})).split('src="')[1].split('"/')[0]
tu = 'https:' + img
img_name = img.split('/')[-1]
print(img)

# Save the image locally; the `with` block closes the file (the original
# also called f.close() inside the with, which was redundant).
with open(img_name, 'wb') as f:
    f.write(requests.get(tu).content)

document = Document()
document.add_paragraph(wen)    # add the article text to the document
document.add_picture(img_name) # add the downloaded image to the document
# NOTE(review): python-docx writes .docx content; the '.doc' extension is
# misleading — consider renaming to 'tuwen.docx'.
document.save('tuwen.doc')     # save the document
os.remove(img_name)            # delete the local image copy
# Page setup for the current section `sec` (script fragment — `sec`, `doc`,
# `distance` and the `chg_font` helper are defined earlier in the script).
sec.top_margin = distance
sec.bottom_margin = distance
sec.page_width = Inches(12)   # page width
sec.page_height = Inches(20)  # page height

# Default font for the whole document ('宋体' == SimSun; chg_font is a
# project helper).
chg_font(doc.styles['Normal'], fontname='宋体')

# Step 3: add paragraph text, tables, images, etc. with specific styles.
# 1. Add a text paragraph and tune its spacing.
paragraph = doc.add_paragraph('text....')
ph_format = paragraph.paragraph_format
ph_format.space_before = Pt(10)  # space before the paragraph
ph_format.space_after = Pt(12)   # space after the paragraph
ph_format.line_spacing = Pt(19)  # line spacing

# 2. Add an empty 4x4 table and fill one cell.
tab = doc.add_table(rows=4, cols=4)
cell = tab.cell(1, 3)  # cell lookup is 0-indexed (row, col)
cell.text = 'abc'
def article_summarizer_BERT(path, name, ratio):
    """Summarize each article found at `path` with a BERT extractive
    summarizer and write the results to '<name>.docx'.

    For every article the document receives: a justified summary paragraph
    (with two punctuation-spacing artifacts patched), then the full article
    with the sentences that appear in the summary highlighted in yellow.

    Args:
        path: source file handed to articles() / xls_to_csv(); presumably a
            spreadsheet with a 'Title' column — TODO confirm.
        name: document heading and output file stem.
        ratio: fraction of each article kept by the summarizer.
    """
    articles_list = articles(path)
    articles_lists = []
    for art in articles_list:
        articles_lists.append(sent_tokenize(art))
    model = Summarizer()
    summaries = []
    for x in articles_list:
        result = model(x, ratio=ratio, min_length=60)
        summaries.append(result)
    titles_list = []
    df = xls_to_csv(path)
    for i in range(len(df)):
        titles_list.append(df['Title'][i])
    document = Document()
    document.add_heading(name, 0)
    for i in range(len(summaries)):
        document.add_heading('Résumé : ' + titles_list[i], level=1)
        resume = document.add_paragraph(summaries[i])
        resume.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
        # Patch ').'-glitches: insert a space when ').' is not followed by
        # one, rebuilding the paragraph run by run.
        # NOTE(review): after the first pass `error1` indexes into `post`
        # while the loop condition indexes `resume.text` — confirm this is
        # intentional.
        error1 = resume.text.find(').')
        while error1 > -1 and error1 + 2 <= len(
                resume.text) - 1 and resume.text[error1 + 2] != ' ':
            pre = resume.text[:error1 + 2]
            post = resume.text[error1 + 2:]
            error1 = post.find(').')
            resume.text = pre
            resume.add_run(' ')
            resume.add_run(post)
        # Patch ' .'-glitches: replace space-before-period with '. '.
        error2 = resume.text.find(' .')
        while error2 > -1:
            pre = resume.text[:error2]
            post = resume.text[error2 + 2:]
            resume.text = pre
            resume.add_run('. ')
            resume.add_run(post)
            error2 = resume.text.find(' .')
        document.add_heading(titles_list[i], level=1)
        para = document.add_paragraph()
        for j in range(len(articles_lists[i])):
            # NOTE(review): this reassignment of `resume` is never used —
            # looks like leftover code; confirm before removing.
            resume = sent_tokenize(document.paragraphs[2].text)
            if articles_lists[i][j] in summaries[i]:
                # Sentence occurs in the summary: highlight it in yellow.
                run = para.add_run(articles_lists[i][j])
                run.font.highlight_color = WD_COLOR_INDEX.YELLOW
                para.add_run(' ')
            else:
                para.add_run(articles_lists[i][j])
                para.add_run(' ')
        para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    file_name = name + '.docx'
    document.save(file_name)
# --- Tally one publication record (script fragment) ------------------------
# NOTE(review): depends on `type_table`, `publication_type`, `count_pages`,
# `info`, `author_id_number`, `max_year`, `sort_tables` and the journals_*
# accumulator lists defined earlier in the script. The for-loop at the end
# continues past this fragment.
count_pages_int = 0
# For journal articles ('статья'): page range is encoded as digit characters
# in `count_pages` (start = digits 0-1, end = digits 3-4); the difference is
# divided by 16 — presumably converting pages to printer's sheets (confirm).
if type_table[0] in publication_type and type_table[0] == 'статья':
    count_pages_int = (int(count_pages[3]) * 10 + int(count_pages[4]) -
                       int(count_pages[0]) * 10 - int(count_pages[1])) / 16
    type_table[2] += count_pages_int
    type_table[1] += 1
    journals_publish_name.append(info["WorkName"])
    author_id_number += 1
    for author in info["Authors"]:
        journals_authors_names.append([author["Name"], author_id_number])
    journals_output_data.append(info["PublicationMeta"]["ЖУРНАЛ"])
    journals_count_pages.append(count_pages_int)

# --- Build the report document ("Appendix 2") ------------------------------
document = Document()
paragraph_1 = document.add_paragraph()
paragraph_1.add_run('Приложение 2')
paragraph_1.alignment = 2  # 2 == right-aligned
paragraph_2 = document.add_paragraph()
paragraph_2.add_run('к Распоряжению № _____ от ______ г.')
paragraph_2.alignment = 2
document.add_heading('РЕЗУЛЬТАТИВНОСТЬ НАУЧНО-ИССЛЕДОВАТЕЛЬСКОЙ ДЕЯТЕЛЬНОСТИ КАФЕДРЫ «Информационная безопасность» В' + ' ' + str(max_year) + ' ' + 'ГОДУ', 1)

# Summary table: first six sorted metrics, two columns (metric / count).
all_publish = document.add_table(rows=len(sort_tables[:6]) + 1, cols=2)
all_publish.style = 'Table Grid'
heading_all_publish = all_publish.rows[0].cells
heading_all_publish[0].paragraphs[0].add_run('Показатель').bold = True
heading_all_publish[1].paragraphs[0].add_run('Количество').bold = True
iterator = 1
for info in sort_tables[:6]:
    all_publish_row_data = all_publish.rows[iterator].cells
class Print_document():
    """Builds a vulnerability-report .docx section by section.

    Usage: call start_doc() (new report) or reinitialize_doc() (resume the
    scratch copy written by Savereport()), then initialize_doc() to set
    margins/fonts/header/footer, then the setters in any order; Savedoc()
    or Savereport() writes the file.
    """

    def start_doc(self):
        """Start a brand-new document."""
        self.document = Document()

    def reinitialize_doc(self):
        """Resume editing the scratch copy saved by Savereport()."""
        self.document = Document('Temp.docx')

    def initialize_doc(self):
        """Apply page margins, base fonts, the logo header and the footer."""
        for section in self.document.sections:
            # 2.54 cm == 1 inch margins all round.
            section.top_margin = Cm(2.54)
            section.bottom_margin = Cm(2.54)
            section.left_margin = Cm(2.54)
            section.right_margin = Cm(2.54)
        style = self.document.styles['Normal']
        font = style.font
        font.name = 'Times New Roman'
        font.size = Pt(14)
        style = self.document.styles['Heading 2']
        font1 = style.font
        font1.name = 'TimesNewRoman'
        font1.size = Pt(16)
        header = self.document.sections[0].header
        ht0 = header.add_paragraph()
        kh = ht0.add_run()
        kh.add_picture('Pristine.png', width=Inches(2))
        # Fix: alignment is a paragraph property, not a run property — the
        # original set it on the run, which silently did nothing.
        ht0.alignment = WD_ALIGN_PARAGRAPH.LEFT
        footer = self.document.sections[0].footer
        f = footer.add_paragraph(
            'All Rights Reserved by Pristine InfoSolutions Pvt. Ltd.')
        f.alignment = WD_ALIGN_PARAGRAPH.CENTER
        f.style = self.document.styles['Normal']
        # Fix: bold/size belong to runs, not paragraphs — the original set
        # f.bold / f.size on the paragraph, which had no effect.
        if f.runs:
            f.runs[0].bold = True
            f.runs[0].font.size = Pt(16)

    def setVname(self, Vname):
        """Add the 'Vulnerability Name' heading plus its value."""
        self.document.add_heading('Vulnerability Name:', 2)
        p = self.document.add_paragraph(Vname)
        p.style = self.document.styles['Normal']

    def setVSeverity(self, severity):
        """Add the 'Severity' heading plus its value."""
        p = self.document.add_heading('Severity', 2)
        # The 'Heading 2' style (configured in initialize_doc) supplies the
        # font name/size; the original's p.bold / p.size / p.name paragraph
        # assignments were silent no-ops and were removed.
        p.style = self.document.styles['Heading 2']
        p = self.document.add_paragraph(severity)
        p.style = self.document.styles['Normal']

    def SetVdesc(self, VDesc):
        """Add the 'Vulnerability Description' heading plus its text."""
        self.document.add_heading('Vulnerability Description:', 2)
        p = self.document.add_paragraph(VDesc)
        # Consistency with the other setters: explicitly use 'Normal'.
        p.style = self.document.styles['Normal']

    def setVurl(self, Vurl):
        """Add the 'Vulnerable URL' heading plus the URL."""
        self.document.add_heading('Vulnerable URL: ', 2)
        p = self.document.add_paragraph(Vurl)
        p.style = self.document.styles['Normal']

    def setImg(self, Img):
        """Add the 'Proof of Concept' heading and embed each image.

        Img: empty, or a sequence whose first element is a list of image
        paths (matches the original indexing Img[0][i]).
        """
        self.document.add_heading('Proof of Concept: ', 2)
        if Img:
            for img_path in Img[0]:
                self.document.add_picture(img_path, width=Cm(15.95))

    def setImpact(self, VImpact):
        """Add the 'Impact' heading plus its text."""
        self.document.add_heading('Impact: ', 2)
        p = self.document.add_paragraph(VImpact)
        p.style = self.document.styles['Normal']

    def setVremed(self, Vrem):
        """Add the 'Remediation' heading plus its text."""
        self.document.add_heading('Remediation', 2)
        p = self.document.add_paragraph(Vrem)
        p.style = self.document.styles['Normal']

    def setConclusion(self, Conclusion):
        """Add the 'Conclusion' heading plus its text."""
        self.document.add_heading('Conclusion', 2)
        p = self.document.add_paragraph(Conclusion)
        p.style = self.document.styles['Normal']

    def pageBreak(self):
        """Insert a page break."""
        self.document.add_page_break()

    def Savedoc(self, name):
        """Save as '<name[0]>.docx'."""
        self.document.save(name[0] + '.docx')

    def Savereport(self):
        """Save the scratch copy that reinitialize_doc() reopens."""
        self.document.save('Temp.docx')
def train_GradientBoosting(self, X, y, title):
    """Grid-search a GradientBoostingRegressor and write a .docx report.

    Splits (X, y) 70/30, tunes learning_rate / n_estimators / max_features
    by 5-fold CV for each scoring metric, pickles the fitted search object,
    saves the test-set predictions to Excel, and writes all CV results plus
    the test-set MSE into '<title>_GradientBoosting_<score>.docx'.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    # Hyper-parameter grid explored by cross-validation.
    tuned_parameters = [{
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [100, 200, 300],
        'max_features': ['auto', 'sqrt', 0.2, 0.4, 0.6]
    }]
    scores = ['neg_mean_squared_error']
    for score in scores:
        document = Document()
        document.add_heading('{}_GradientBoosting_{}'.format(title, score), 0)
        document.add_paragraph("# Tuning hyper-parameters for %s" % score)
        clf = GridSearchCV(GradientBoostingRegressor(),
                           tuned_parameters,
                           cv=5,
                           scoring=score)
        clf.fit(X_train, y_train)
        document.add_paragraph("Best parameters set found on development set:")
        # Fix: best_params_ is a dict; python-docx paragraphs require str.
        document.add_paragraph(str(clf.best_params_))
        document.add_paragraph("Grid scores on development set:")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            document.add_paragraph("%0.3f (+/-%0.03f) for %r" %
                                   (mean, std * 2, params))
        document.add_paragraph()
        document.add_paragraph("The scores are computed on the full evaluation set")
        y_true, y_pred = np.squeeze(y_test).tolist(), clf.predict(
            X_test).tolist()
        print(clf.best_estimator_.feature_importances_)
        # Persist the fitted search object and the test-set predictions.
        self.saver.save_pickle(
            data=clf, filename='GradientBoosting_{}'.format(title))
        pre_test = pd.DataFrame({'y_true': y_true, 'y_pre': y_pred})
        self.saver.save_excel(pre_test,
                              filename='predictions_gradientboosting',
                              foldername='Model')
        document.add_paragraph(str(mean_squared_error(y_true, y_pred)))
        fold_path = os.path.join(self.datasavedir, 'doc', 'Model')
        save_path = self.check_savepath(
            foldpath=fold_path,
            filename='{}_GradientBoosting_{}.docx'.format(title, score))
        document.save(save_path)
from bs4 import BeautifulSoup
import requests
from docx import Document
import os

OUTPUT_DIR = './scpArchive'


def archive_scp(number):
    """Download one SCP wiki entry and save its paragraphs as a .docx.

    `number` is zero-padded to three digits (scp-002 ... scp-099), which
    matches both the URL and the output-file naming the original two loops
    produced. Paragraphs 8 through the second-to-last are kept (skipping the
    page boilerplate); any paragraph python-docx rejects is replaced with an
    error-marker line.
    """
    slug = 'scp-' + str(number).zfill(3)
    page = requests.get('http://www.scp-wiki.net/' + slug)
    soup = BeautifulSoup(page.text, 'html.parser')
    main_text = soup.select('p')
    file = Document()
    for j in range(8, len(main_text) - 1):
        try:
            file.add_paragraph(main_text[j].getText())
        except ValueError:
            file.add_paragraph('Line ' + str(j) + " had an error")
    file.save(OUTPUT_DIR + '/' + slug + '.docx')


if not os.path.isdir(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

# The original duplicated this loop for 2-9 ("scp-00N") and 10-99 ("scp-0NN");
# zero-padding collapses both into a single pass with identical URLs/files.
for i in range(2, 100):
    archive_scp(i)
class Docx:
    """Thin wrapper around a python-docx Document with raw-XML helpers for
    cell shading and rich run formatting."""

    # Raw WordprocessingML snippets spliced into run text by set_cell() to
    # encode newline / new-paragraph / tab / page-break control characters.
    NEWLINE_XML = '</w:t><w:br/><w:t xml:space="preserve">'
    NEWPARAGRAPH_XML = '</w:t></w:r></w:p><w:p><w:r><w:t xml:space="preserve">'
    TAB_XML = '</w:t></w:r><w:r><w:tab/></w:r><w:r><w:t xml:space="preserve">'
    PAGE_BREAK = '</w:t><w:br w:type="page"/><w:t xml:space="preserve">'

    def __init__(self, doc_name):
        # Open doc_name when given, otherwise start an empty document.
        # doc_name is remembered so close() can save back to it.
        if doc_name:
            self.doc = Document(doc_name)
        else:
            self.doc = Document()
        self.doc_name = doc_name

    def add_heading(self, title_level, title):
        '''
        Add a heading.
        title_level: heading level (1, 2, 3, ...)
        title: heading text
        '''
        self.doc.add_heading(title, level=title_level)

    def add_paragraph(self, content):
        # Returns the new python-docx Paragraph object.
        return self.doc.add_paragraph(content)

    def set_paragraph_property(self, paragraph, blod, font_size, font_color, bk_color):
        # Not implemented yet.
        pass

    def add_table(self, row=1, col=1, style=None):
        # NOTE(review): the `style` parameter is ignored — the table style is
        # hard-coded to 'Table Grid'; confirm whether callers rely on that.
        table = self.doc.add_table(rows=row, cols=col, style='Table Grid')
        table.autofit = False
        return table

    def get_cell(self, table, row, col):
        '''
        Return the cell at the given row and column of a table.
        table: a _Table object, e.g. obtained from add_table()
        row: 0-based row index (0, 1, 2, ... for rows 1, 2, 3, ...)
        col: 0-based column index (0, 1, 2, ... for columns 1, 2, 3, ...)
        '''
        return table.rows[row].cells[col]
        # return table.cell(row,col)

    def merge_cell(self, cells):
        # Merge a list of cells into one, collapsing their combined text
        # into a single newline-free string; returns the merged cell (or the
        # sole cell when fewer than two are given).
        merged_cell = cells[0]
        if len(cells) < 2:
            return merged_cell
        else:
            text = ''
            for i in range(1, len(cells)):
                merged_cell = merged_cell.merge(cells[i])
            if merged_cell.paragraphs and len(merged_cell.paragraphs) > 0:
                text = merged_cell.text.replace('\n', '')
                merged_cell.text = text
            return merged_cell

    def set_cell_background(self, cell, bk_color=None):
        '''
        Set a cell's background colour.
        cell: a _Cell object, e.g. obtained from get_cell()
        bk_color: RGB string, e.g. "AABBCC"
        '''
        xml = r'<w:shd {0} w:fill="{1}"/>'.format(nsdecls('w'), bk_color)
        # print(xml)
        shading_elm_1 = parse_xml(xml)
        cell._tc.get_or_add_tcPr().append(shading_elm_1)

    def set_cell_text(self, cell, text):
        '''
        Set a cell's text.
        cell: a _Cell object, e.g. obtained from get_cell()
        text: content to write; replaces existing text if the cell has any
        '''
        paragraph = None
        if cell.paragraphs and len(cell.paragraphs) > 0:
            paragraph = cell.paragraphs[-1]  # use the last paragraph
            paragraph.text = text
        else:
            paragraph = cell.add_paragraph(text)
            # NOTE(review): first_line_indent is set directly on the
            # paragraph (a no-op attribute) and then to 0 via
            # paragraph_format — looks like indent-removal experiments;
            # confirm intent before cleaning up.
            paragraph.first_line_indent = Inches(-10)
            paragraph.paragraph_format.first_line_indent = Inches(0)
            # p.aligment = WD_ALIGN_PARAGRAPH.LEFT
            # p.left_indent = Inches(0)
            # paragraph_format = p.paragraph_format
            # paragraph_format.left_indent = Inches(0)
            # cell.text = text

    def set_cell(self, cell, text, style=None, color=None, highlight=None,
                 size=None, subscript=None, superscript=None, bold=False,
                 italic=False, underline=False, strike=False, font=None,
                 url_id=None):
        # Build a WordprocessingML <w:r> run for `text` with the requested
        # character properties and append it to the cell element.
        # If not a string : cast to string (ex: int, dict etc...)
        if not isinstance(text, (six.text_type, six.binary_type)):
            text = six.text_type(text)
        if not isinstance(text, six.text_type):
            text = text.decode('utf-8', errors='ignore')
        # Escape XML and translate control characters into markup.
        text = (escape(text)
                .replace('\n', self.NEWLINE_XML)
                .replace('\a', self.NEWPARAGRAPH_XML)
                .replace('\t', self.TAB_XML)
                .replace('\f', self.PAGE_BREAK))
        prop = u''
        if style:
            prop += u'<w:rStyle w:val="%s"/>' % style
        if color:
            if color[0] == '#':
                color = color[1:]
            prop += u'<w:color w:val="%s"/>' % color
        if highlight:
            if highlight[0] == '#':
                highlight = highlight[1:]
            prop += u'<w:highlight w:val="%s"/>' % highlight
        if size:
            prop += u'<w:sz w:val="%s"/>' % size
            prop += u'<w:szCs w:val="%s"/>' % size
        if subscript:
            prop += u'<w:vertAlign w:val="subscript"/>'
        if superscript:
            prop += u'<w:vertAlign w:val="superscript"/>'
        if bold:
            prop += u'<w:b/>'
        if italic:
            prop += u'<w:i/>'
        if underline:
            if underline not in ['single', 'double']:
                underline = 'single'
            prop += u'<w:u w:val="%s"/>' % underline
        if strike:
            prop += u'<w:strike/>'
        if font:
            prop += (
                u'<w:rFonts w:ascii="{font}" w:hAnsi="{font}" w:cs="{font}"/>'
                .format(font=font)
            )
        xml = u'<w:r>'
        if prop:
            xml += u'<w:rPr>%s</w:rPr>' % prop
        xml += u'<w:t xml:space="preserve">%s</w:t></w:r>' % text
        if url_id:
            xml = (
                u'<w:hyperlink r:id="%s" w:tgtFrame="_blank">%s</w:hyperlink>'
                % (url_id, xml)
            )
        print(xml)
        # NOTE(review): the run element is appended to tcPr (the cell
        # *properties* element) — runs normally belong inside a paragraph;
        # confirm this produces the intended markup.
        tcPr = parse_xml(xml)
        cell._tc.get_or_add_tcPr().append(tcPr)

    def close(self):
        # Save back to the file this wrapper was opened with.
        self.doc.save(self.doc_name)

    def save_as(self, doc_name):
        # Save under a different file name.
        self.doc.save(doc_name)
def exam_export(exam):
    """Write an exam's overview fields into a Word report.

    The document is saved as '..\\..\\Reports\\Exams\\<name>.(<id>).docx',
    with a title heading, an 'Overall information' section heading and one
    paragraph per field.
    """
    report = Document()
    report.add_heading(exam.exam_name + "(Exam)", 0)
    report.add_heading('Overall information', level=1)
    # One "label: value" paragraph per exam attribute, in a fixed order.
    overview_lines = (
        'ID: ' + str(exam.id),
        'Name: ' + exam.exam_name,
        'Pass time: ' + exam.pass_time,
        'Status: ' + exam.status,
        'Score: ' + str(exam.score),
        'Enrollee: ' + str(exam.get_enrollee()),
        'Examiner: ' + str(exam.get_examiner()),
    )
    for line in overview_lines:
        report.add_paragraph(line)
    report.save('..\\..\\Reports\\Exams\\' + exam.exam_name +
                '.(' + str(exam.id) + ').docx')
def image_extract():
    """Scan resume images under 'resumes/', OCR them, and build a .docx.

    For a jpg/jpeg/png file: isolate the document region via contour
    masking, binarize it, OCR the scan with pytesseract, dump the text into
    a .txt, convert that text to a Word file, then delete the intermediate
    scan and text files. Returns the path of the generated .docx.

    NOTE(review): the `return` sits inside the loop, so only the FIRST
    matching image is processed — confirm whether that is intended.
    """
    directory = 'resumes/'
    for file in os.listdir(directory):
        if file.endswith((".jpg", ".jpeg", ".png")):
            full_path = os.path.join(directory, file)
            # Read the image; keep an untouched copy.
            img = cv2.imread(full_path)
            ratio = img.shape[0] / 500.0
            original_img = img.copy()
            # Grayscale, blur and edge maps (blurred/edged are computed but
            # unused downstream — kept from the original pipeline).
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            blurred = cv2.GaussianBlur(gray, (5, 5), 0)
            edged = cv2.Canny(gray, 75, 200)
            # Binary-inverse threshold, then external contours.
            thresh = cv2.threshold(gray, 225, 255, cv2.THRESH_BINARY_INV)[1]
            (cnts, _) = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL,
                                         cv2.CHAIN_APPROX_SIMPLE)
            cv2.drawContours(img, cnts, -1, (240, 0, 159), 3)
            H, W = img.shape[:2]
            # Pick the first large, roughly-square contour near the centre.
            for cnt in cnts:
                x, y, w, h = cv2.boundingRect(cnt)
                if cv2.contourArea(cnt) > 100 and (0.7 < w / h < 1.3) and (
                        W / 4 < x + w // 2 < W * 3 / 4) and (
                        H / 4 < y + h // 2 < H * 3 / 4):
                    break
            # Mask everything outside the chosen contour.
            mask = np.zeros(img.shape[:2], np.uint8)
            cv2.drawContours(mask, [cnt], -1, 255, -1)
            dst = cv2.bitwise_and(img, img, mask=mask)
            # Clean up and binarize the masked scan, then save it.
            gray = cv2.cvtColor(dst, cv2.COLOR_BGR2GRAY)
            gray = cv2.medianBlur(gray, 3)
            gray = cv2.threshold(gray, 0, 255,
                                 cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
            scanned_file_name = "resumes/" + str(file[:-4]) + "-Scanned.png"
            cv2.imwrite(scanned_file_name, dst)
            # OCR the scan and append the text to a .txt next to the image.
            file_text = pytesseract.image_to_string(
                Image.open(scanned_file_name))
            text_file_name = "resumes/" + str(file) + ".txt"  # keeps the jpg extension
            word_file_name = "resumes/" + str(file) + ".docx"
            with open(text_file_name, "a") as f:
                f.write(file_text + "\n")
            # Build the Word document once. (The original wrapped this in
            # `for i in os.listdir(directory):`, re-creating the identical
            # document for every directory entry — redundant work removed.)
            document = Document()
            with open(text_file_name) as f:
                myfile = f.read()
            # Remove all non-XML-compatible characters before handing the
            # text to python-docx.
            myfile = re.sub(r'[^\x00-\x7F]+|\x0c', ' ', myfile)
            document.add_paragraph(myfile)
            document.save(word_file_name)
            os.remove(scanned_file_name)
            os.remove(text_file_name)
            return word_file_name
class Visualizer:
    """
    Reads in the log files and extracts the necessary information needed to
    a docx (pdf later) file

    Arguments:
    - path_logs: the path to the logs
    - mturk_session: the mturk dev session

    Returns:
    - A word file which shows the necessary information from the logs
    """

    def __init__(self, path_logs, mturk_session):
        # Output directory and the session's log directory.
        self.path_results = os.path.join(path_logs + "/output/", mturk_session)
        self.path_logs = os.path.join(path_logs, mturk_session)

    def run(self, global_rdr):
        """Read the session logs and write one .docx per meetup room.

        global_rdr: optional module providing ReadLogs — differentiates
        between being called from here or from run.py; falls back to the
        locally imported `rdr` module when falsy.
        """
        if not global_rdr:
            logs_reader = rdr.ReadLogs(self.path_logs)
        else:
            logs_reader = global_rdr.ReadLogs(self.path_logs)
        # logs_reader = grdr.ReadLogs(self.path_logs, self.mturk_session)

        # One list of speaker userids and one list of json logtexts, per room.
        list_users, list_logtext = logs_reader.run()
        # Fix: the original iterated zip(self.list_users, self.list_logtext),
        # but those attributes are never assigned anywhere — it raised
        # AttributeError. Use the local results returned above.
        for _users, single_logtext in zip(list_users, list_logtext):
            self._write_to_docx(_users, single_logtext)
        print("DONE")

    def _write_to_docx(self, _users, single_logtext, includeProcessed=False,
                       single_sample=None):
        """
        This function creates a document (word file) for each logfile and
        writes the necessary information into it (pdf later)
        """
        ##########################################################
        # Create a dict for structuring the inputs               #
        ##########################################################
        # value = [heading, table rows, table cols, *column headers]
        dict_inputs = {
            'table_traits': ['Traits / Attitudes', 1, 1, 'Persona'],
            'table_facts': ['Facts / Knowledge', 1, 2, 'Entity', 'Content'],
            'table_questions': ['Questions', 1, 1, ''],
            'table_answers': ['Answers', 1, 2, 'Entity', 'Answer']
        }
        # Pull the story type and the provided-entities table (announced by
        # the Moderator) out of the log before the main pass.
        story = None
        for item in single_logtext:
            if item['user']['name'] == "Moderator" and item[
                    'type'] == "story_type":
                story = item['story_type']
            if item['type'] == "table_entities":
                table_entity = item['table']

        # Create a new document for each meetup log.
        self.document = Document()
        if story is None:
            self.document.add_heading("Story type not specified", 0)
        else:
            self.document.add_heading(story, 0)

        curr_userid = 0
        dialogue_list = []
        dialogue_tok_list = []
        for item in single_logtext:
            ##############################################################
            # Input necessary info using dict_inputs                    #
            ##############################################################
            # Write the facts and traits for both users.
            for _id in _users:
                if _id == _users[0]:
                    speaker = 'First speaker'
                else:
                    speaker = 'Second speaker'
                for input, params in dict_inputs.items():
                    if item['type'] == input and item['user']['id'] == _id:
                        if input == 'table_traits':
                            # Only add this heading at the first table of
                            # each speaker.
                            p = self.document.add_heading(speaker, level=1)
                            p.add_run().bold = True
                        p = self.document.add_heading(params[0], level=2)
                        p.add_run().italic = True
                        if len(params) > 4:
                            # e.g. table_facts and table_answers: two-column
                            # tables (key = topic, value = fact).
                            table = self.document.add_table(rows=params[1],
                                                            cols=params[2])
                            table.style = 'Table Grid'
                            hdr_cells = table.rows[0].cells
                            hdr_cells[0].text = params[3]
                            hdr_cells[1].text = params[4]
                            # NOTE: zip only works if both iterations have
                            # the same length.
                            for _entity, _content in zip(
                                    item['table'][0], item['table'][1]):
                                row_cells = table.add_row().cells
                                row_cells[0].text = _entity
                                row_cells[1].text = _content
                        else:
                            # e.g. table_attributes and table_questions:
                            # simple bullet lists.
                            for _content in item['table']:
                                if input == 'table_traits':
                                    self.document.add_paragraph(
                                        _content, style='List')
                                else:
                                    self.document.add_paragraph(
                                        _content, style='List Bullet')

            ######################################################
            # Rearrange the dialogue speaker turns               #
            ######################################################
            # Collapse consecutive turns by the same speaker into one line.
            if item['type'] == 'text' and item['user']['id'] in _users:
                if item['user']['id'] == curr_userid:
                    # Same speaker used two turns: join with an [EOU] marker.
                    # NOTE(review): tmp_msg is only bound after the first
                    # speaker change — confirm the first log entry can never
                    # match curr_userid == 0.
                    msg = tmp_msg + " [EOU] " + item['msg']
                    dialogue_list[len(dialogue_list) - 1] = msg
                    tmp_msg = msg
                else:
                    dialogue_list.append(item['msg'])
                    tmp_msg = item['msg']
                    curr_userid = item['user']['id']

            if item['type'] == "join" and item['user']['name'] == "Moderator":
                # Use the join timestamp + room name to name the file;
                # ':' cannot appear in file names.
                uid = item['timestamp-iso'] + "-" + item['room']['name']
                fname = uid.replace(":", "-") + ".docx"
        # -------------------------------- END OF FOR LOOP ----------------#

        #####################################################
        # Input entities provided                           #
        #####################################################
        e = self.document.add_heading('Entities Provided To Users', level=1)
        e.add_run().bold = True
        for ent in table_entity:
            self.document.add_paragraph(ent, style='List Bullet')

        ################################################
        # Input the dialogue                           #
        ################################################
        p = self.document.add_heading('Dialogue', level=1)
        p.add_run().bold = True
        for y in range(0, len(dialogue_list)):
            if y % 2 == 0:
                # Even indices: first speaker (left-aligned default).
                self.document.add_paragraph(dialogue_list[y])
            else:
                # Odd indices: second speaker, right-aligned.
                paragraph = self.document.add_paragraph(dialogue_list[y])
                paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT

        if includeProcessed:
            self._include_processed_dialogue(single_sample)

        ######################
        # Save               #
        ######################
        self._saver(self.document, fname)
        print("\nVisualization result has been saved into {}".format(
            self.path_results))
        return True

    def _include_processed_dialogue(self, single_sample):
        """
        Create a document (word file) for the processed logfile

        single_sample_output.update({
            'uid': uid,
            'story': story,
            'entity_provided': table_entity,
            'knowledge': knowledge_dict,
            'attitudes': attitudes_dict,
            'questions': question_dict,
            'answers': answer_dict,
            'named_entities': 'empty',
            'dialogue_orig': self.dialogue,
            'dialogue_tokenized': dialogue_tokenized,
            'dialogue_processesd': 'empty',
            'dialogue_named_entity': 'empty'
        })
        """
        #########################################
        # Add the other dialogues               #
        #########################################
        dict_inputs = {
            'dialogue_original': 'Original Dialogue',
            'dialogue_processesd': 'Processesd Dialogue',
            'dialogue_named_entity': 'Named Entity Dialogue'
        }
        ###################################################################
        # Input the selected processed dialogue                           #
        ###################################################################
        for entity in single_sample:
            if entity in dict_inputs:
                p = self.document.add_heading(dict_inputs[entity], level=1)
                p.add_run().bold = True
                if single_sample[entity] == 'empty':
                    self.document.add_paragraph(
                        "{} not yet added".format(entity))
                else:
                    dialogue_list = single_sample[entity]
                    for y in range(0, len(dialogue_list)):
                        if y % 2 == 0:
                            # Even indices: first speaker.
                            self.document.add_paragraph(dialogue_list[y])
                        else:
                            # Odd indices: second speaker, right-aligned.
                            paragraph = self.document.add_paragraph(
                                dialogue_list[y])
                            paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT

    def _saver(self, _document, _fname):
        # Ensure the output directory exists, then save the document there.
        if not os.path.exists(self.path_results):
            os.makedirs(self.path_results)
        doc_path = os.path.join(self.path_results, _fname)
        _document.save(doc_path)
def get_info(self, id, path):
    """Scrape a Douyin (TikTok CN) user's public share page into a Word file.

    Downloads https://www.iesdouyin.com/share/user/<id>, saves the avatar to a
    temporary PNG, extracts nickname / signature / counters, writes
    ``<path>/<name>/info.docx`` and deletes the temporary avatar.

    :param id: share-page user id (NOTE: shadows the builtin ``id`` and is
        later rebound to the scraped "short id")
    :param path: base output directory; ``self.__create_path`` makes the
        per-user sub-folder
    """
    link = 'https://www.iesdouyin.com/share/user/' + str(id)
    response = requests.get(link, headers=self.__headers)
    html = pq(response.text)
    # Avatar <img> of the profile card.
    imgUrl = html(
        '#pagelet-user-info > div.personal-card > div.info1 > span.author > img'
    ).attr('src')
    # Separate desktop UA for the image host (the page headers may differ).
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
    }
    tempRsp = requests.get(imgUrl, headers=headers)
    name = html(
        '#pagelet-user-info > div.personal-card > div.info1 > p.nickname'
    ).text()
    self.__create_path(path, name)
    # Persist the avatar to disk so python-docx can embed it below.
    file = open(path + '/' + name + '/temp.png', 'wb')
    file.write(tempRsp.content)
    file.close()
    info = html(
        '#pagelet-user-info > div.personal-card > div.info2 > div > span'
    ).text()
    signature = html(
        '#pagelet-user-info > div.personal-card > div.info2 > p.signature'
    ).text()
    # The numeric fields are scraped raw and passed through self.__trans --
    # presumably it decodes the site's obfuscated digit glyphs (TODO confirm).
    id = self.__trans(
        re.compile('<p class="shortid">(.*?)</p>').findall(
            response.text)[0])
    focus = self.__trans(
        re.compile('<span class="num">(.*?)</span>').findall(
            response.text)[0])
    follow = self.__trans(
        re.compile(
            '<span class="follower block"><span class="num">(.*?)</span> </span>'
        ).findall(response.text)[0])
    liked = self.__trans(
        re.compile(
            '<span class="liked-num block"><span class="num">(.*?)</span>'
        ).findall(response.text)[0])
    production = self.__trans(
        re.compile(
            '<div class="user-tab active tab get-list" data-type="post">(.*?)</div>'
        ).findall(response.text)[0])
    like = self.__trans(
        re.compile(
            '<div class="like-tab tab get-list" data-type="like">(.*?)</div>'
        ).findall(response.text)[0])
    # Strip markup/labels from the decoded fragments, leaving bare numbers.
    id = pq(id).text().replace(' ', '').replace('抖音ID:', '')
    focus = pq(focus).text().replace(' ', '')
    follow = pq(follow).text().replace(' ', '').replace('粉丝', '')
    liked = pq(liked).text().replace(' ', '')
    production = pq(production).text().replace(' ', '').replace('作品', '')
    like = pq(like).text().replace(' ', '').replace('喜欢', '')
    # Assemble the report: heading, avatar, then one labelled line per field.
    document = Document()
    document.add_heading(name, 0)
    document.add_picture(path + '/' + name +
                         '/temp.png', width=Inches(1.25))
    self.__add_param(document, '抖音ID', id)
    document.add_paragraph(info)
    document.add_paragraph(signature)
    self.__add_param(document, '关注', focus)
    self.__add_param(document, '粉丝', follow)
    self.__add_param(document, '赞', liked)
    self.__add_param(document, '作品', production)
    self.__add_param(document, '喜欢', like)
    document.save(path + '/' + name + '/info.docx')
    # The avatar was only needed for embedding; remove the temp file.
    os.remove(path + '/' + name + '/temp.png')
import csv doc_path = "C:/Users/Andrew/Documents/Vocab/Adverb-Stage3-Vocab-excercise.docx" my_doc = Document(doc_path) # 5 words in a group for i in range(25): with open('C:/Users/Andrew/Documents/Vocab/AdvYearmiddlelist.csv', 'r') as file: start = i * 10 n = 0 finish = start + 10 print(start) print(finish) word_list = csv.reader(file.readlines()[start:finish]) my_doc.add_paragraph('------ Exercise ------') word_group = [] # print the questions for item in word_list: url_word = ''.join(item) word_group.append(url_word) print(word_group) n = n + 1 # url = "https://dictionary.cambridge.org/dictionary/english/" + url_word # url = "https://tangorin.com/sentences?search=" + url_word # hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0)'} # req = urllib.request.Request(url, headers=hdr) # page_html = urllib.request.urlopen(req).read() # page_soup = bs(page_html, "html.parser") # # my_divs = page_soup.find_all("div", {"class": 'examp dexamp'})
def generate(): document = Document() document.add_heading('病人病历报告', 0) # time tool time_tup = time.localtime(time.time()) # print(time_tup) format_time = '%Y-%m-%d %a %H:%M:%S' cur_time = time.strftime(format_time, time_tup) # print(cur_time) df = pd.read_csv('information.csv') name = df['Name'].values[0] volume = df['Volume'].values[0] mass = df['Mass'].values[0] sex = df['Sex'].values[0] age = df['Age'].values[0] # print(name, volume, mass) gf = pd.read_csv('image_information.csv') image_raw = gf['image_raw'].values image_masked = gf['image_masked'].values image_value = gf['value'].values # print(image_raw, image_raw, image_value) p = document.add_paragraph(cur_time) # p.add_run('bold').bold = True # p.add_run(' and some ') # p.add_run('italic.').italic = True document.add_heading('病人信息', level=1) # document.add_paragraph('肺结节诊断', style='Intense Quote') document.add_paragraph('姓名:', style='List Bullet') document.add_paragraph(' ' + name.capitalize()) document.add_paragraph('性别:', style='List Bullet') document.add_paragraph(' ' + sex.capitalize()) document.add_paragraph('年龄:', style='List Bullet') document.add_paragraph(' ' + str(age)) # document.add_paragraph( # 'first item in ordered list', style='List Number' # ) records = ((name, volume, mass)) # print(records) document.add_heading('肺结节情况', level=1) p = document.add_paragraph() r = p.add_run() for i in range(len(image_masked)): r.add_picture(image_raw[i], width=Inches(1.25)) r.add_text(' ') r.add_picture(image_masked[i], width=Inches(1.25)) r.add_text(' ') r.add_text(str(image_value[i])) table = document.add_table(rows=1, cols=3, style="Table Grid") hdr_cells = table.rows[0].cells hdr_cells[0].text = 'Name' hdr_cells[1].text = 'Volume' hdr_cells[2].text = 'Mass' row_cells = table.add_row().cells row_cells[0].text = name row_cells[1].text = str(volume) row_cells[2].text = str(mass) document.add_page_break() document.save('病历报告.docx')
class HtmlToDocx(HTMLParser):
    """Streaming HTML -> python-docx converter built on html.parser.

    Feed HTML via add_html_to_document / add_html_to_cell / parse_html_file.
    Parser state lives on the instance: ``self.doc`` is the current write
    target (a Document or a table cell), ``self.paragraph``/``self.run`` the
    current insertion points, and ``self.tags`` tracks open span/list tags.
    The ``skip``/``skip_tag``/``instances_to_skip`` trio suppresses content
    inside <head> and inside tables that are parsed manually.
    """

    def __init__(self):
        super().__init__()
        # Feature switches consumed by set_initial_attrs().
        self.options = {
            'fix-html': True,
            'images': True,
            'tables': True,
            'styles': True,
        }

    def set_initial_attrs(self, document=None):
        """Reset all per-run parser state; write into *document* if given."""
        self.tags = {
            'span': [],   # stack of open <span> attribute dicts
            'list': [],   # stack of open list tags ('ol' / 'ul')
        }
        if document:
            self.doc = document
        else:
            self.doc = Document()
        self.bs = self.options[
            'fix-html']  # whether or not to clean with BeautifulSoup
        # self.document always points at the top-level target; self.doc may
        # temporarily point at a table cell while filling tables.
        self.document = self.doc
        self.include_tables = True  #TODO add this option back in?
        self.include_images = self.options['images']
        self.include_styles = self.options['styles']
        self.paragraph = None
        self.skip = False
        self.skip_tag = None
        self.instances_to_skip = 0

    def get_cell_html(self, soup):
        # Returns string of td element with opening and closing <td> tags removed
        if soup.find_all():
            return '\n'.join(str(soup).split('\n')[1:-1])
        return str(soup)[4:-5]

    def add_styles_to_paragraph(self, style):
        """Apply a parsed CSS dict (text-align, margin-left) to self.paragraph."""
        if 'text-align' in style:
            align = style['text-align']
            if align == 'center':
                self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
            elif align == 'right':
                self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            elif align == 'justify':
                self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
        if 'margin-left' in style:
            margin = style['margin-left']
            units = re.sub(r'[0-9]+', '', margin)
            margin = int(re.sub(r'[a-z]+', '', margin))
            if units == 'px':
                self.paragraph.paragraph_format.left_indent = Inches(
                    min(margin // 10 * INDENT, MAX_INDENT))
            # TODO handle non px units

    def add_styles_to_run(self, style):
        """Apply a parsed CSS dict (color, background-color) to self.run."""
        if 'color' in style:
            if 'rgb' in style['color']:
                # rgb(r,g,b) form: strip letters/parens, split the numbers.
                color = re.sub(r'[a-z()]+', '', style['color'])
                colors = [int(x) for x in color.split(',')]
            else:
                # #rrggbb hex form.
                color = style['color'].lstrip('#')
                colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4))
            self.run.font.color.rgb = RGBColor(*colors)
        if 'background-color' in style:
            if 'rgb' in style['background-color']:
                # NOTE(review): duplicated "color = color =" assignment kept
                # verbatim -- harmless but redundant.
                color = color = re.sub(r'[a-z()]+', '',
                                       style['background-color'])
                colors = [int(x) for x in color.split(',')]
            else:
                color = style['background-color'].lstrip('#')
                colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4))
            # docx highlight is a fixed palette, so any background maps to
            # gray for now.
            self.run.font.highlight_color = WD_COLOR.GRAY_25  #TODO: map colors

    def parse_dict_string(self, string, separator=';'):
        """Parse an inline CSS string ('a:b; c:d') into a dict."""
        new_string = string.replace(" ", '').split(separator)
        string_dict = dict([x.split(':') for x in new_string if ':' in x])
        return string_dict

    def handle_li(self):
        """Start a list-item paragraph styled/indented by the open list stack."""
        # check list stack to determine style and depth
        list_depth = len(self.tags['list'])
        if list_depth:
            list_type = self.tags['list'][-1]
        else:
            list_type = 'ul'  # assign unordered if no tag
        if list_type == 'ol':
            list_style = "List Number"
        else:
            list_style = 'List Bullet'
        self.paragraph = self.doc.add_paragraph(style=list_style)
        self.paragraph.paragraph_format.left_indent = Inches(
            min(list_depth * LIST_INDENT, MAX_INDENT))
        self.paragraph.paragraph_format.line_spacing = 1

    def add_image_to_cell(self, cell, image):
        # python-docx doesn't have method yet for adding images to table cells. For now we use this
        paragraph = cell.add_paragraph()
        run = paragraph.add_run()
        run.add_picture(image)

    def handle_img(self, current_attrs):
        """Insert an <img>: fetch remote URLs, fall back to a placeholder text."""
        if not self.include_images:
            self.skip = True
            self.skip_tag = 'img'
            return
        src = current_attrs['src']
        # fetch image
        src_is_url = is_url(src)
        if src_is_url:
            try:
                image = fetch_image(src)
            except urllib.error.URLError:
                image = None
        else:
            image = src
        # add image to doc
        if image:
            try:
                if isinstance(self.doc, docx.document.Document):
                    self.doc.add_picture(image)
                else:
                    self.add_image_to_cell(self.doc, image)
            except FileNotFoundError:
                image = None
        if not image:
            if src_is_url:
                self.doc.add_paragraph("<image: %s>" % src)
            else:
                # avoid exposing filepaths in document
                self.doc.add_paragraph("<image: %s>" %
                                       get_filename_from_url(src))
        # add styles?

    def handle_table(self):
        """
        To handle nested tables, we will parse tables manually as follows:
        Get table soup
        Create docx table
        Iterate over soup and fill docx table with new instances of this parser
        Tell HTMLParser to ignore any tags until the corresponding closing table tag
        """
        table_soup = self.tables[self.table_no]
        rows, cols = self.get_table_dimensions(table_soup)
        self.table = self.doc.add_table(rows, cols)
        rows = table_soup.find_all('tr', recursive=False)
        cell_row = 0
        for row in rows:
            cols = row.find_all(['th', 'td'], recursive=False)
            cell_col = 0
            for col in cols:
                cell_html = self.get_cell_html(col)
                if col.name == 'th':
                    # Render header cells bold.
                    cell_html = "<b>%s</b>" % cell_html
                docx_cell = self.table.cell(cell_row, cell_col)
                # Each cell is filled by a fresh child parser instance.
                child_parser = HtmlToDocx()
                child_parser.add_html_to_cell(cell_html, docx_cell)
                cell_col += 1
            cell_row += 1
        # skip all tags until corresponding closing tag
        self.instances_to_skip = len(table_soup.find_all('table'))
        self.skip_tag = 'table'
        self.skip = True
        self.table = None

    def handle_starttag(self, tag, attrs):
        """HTMLParser callback: open paragraphs/headings/lists/images/tables."""
        if self.skip:
            return
        if tag == 'head':
            # Suppress everything inside <head>.
            self.skip = True
            self.skip_tag = tag
            self.instances_to_skip = 0
            return
        elif tag == 'body':
            return
        current_attrs = dict(attrs)
        if tag == 'span':
            self.tags['span'].append(current_attrs)
            return
        elif tag == 'ol' or tag == 'ul':
            self.tags['list'].append(tag)
            return  # don't apply styles for now
        elif tag == 'br':
            # NOTE(review): assumes self.run already exists; a leading <br>
            # before any text/paragraph would fail -- confirm inputs.
            self.run.add_break()
            return
        self.tags[tag] = current_attrs
        if tag == 'p':
            self.paragraph = self.doc.add_paragraph()
        elif tag == 'li':
            self.handle_li()
        elif tag[0] == 'h' and len(tag) == 2:
            # NOTE(review): this test also matches 'hr' (int('r') would
            # raise) -- presumably inputs never contain <hr>; verify.
            if isinstance(self.doc, docx.document.Document):
                h_size = int(tag[1])
                self.paragraph = self.doc.add_heading(level=min(h_size, 9))
            else:
                self.paragraph = self.doc.add_paragraph()
        elif tag == 'img':
            self.handle_img(current_attrs)
            return
        elif tag == 'table':
            self.handle_table()
            return
        # set new run reference point in case of leading line breaks
        if tag == 'p' or tag == 'li':
            self.run = self.paragraph.add_run()
        # add style
        if not self.include_styles:
            return
        if 'style' in current_attrs and self.paragraph:
            style = self.parse_dict_string(current_attrs['style'])
            self.add_styles_to_paragraph(style)

    def handle_endtag(self, tag):
        """HTMLParser callback: pop tag state; end skip regions and tables."""
        if self.skip:
            if not tag == self.skip_tag:
                return
            # Nested occurrences of the skip tag must close first.
            if self.instances_to_skip > 0:
                self.instances_to_skip -= 1
                return
            self.skip = False
            self.skip_tag = None
            self.paragraph = None
        if tag == 'span':
            if self.tags['span']:
                self.tags['span'].pop()
                return
        elif tag == 'ol' or tag == 'ul':
            remove_last_occurence(self.tags['list'], tag)
            return
        elif tag == 'a':
            # Links are rendered as visible placeholder text.
            link = self.tags.pop(tag)
            href = link['href']
            self.paragraph.add_run('<link: %s>' % href)
            return
        elif tag == 'table':
            # Leaving a table: restore the top-level write target.
            self.table_no += 1
            self.table = None
            self.doc = self.document
            self.paragraph = None
        if tag in self.tags:
            self.tags.pop(tag)
        # maybe set relevant reference to None?

    def handle_data(self, data):
        """HTMLParser callback: emit text into the current paragraph/run."""
        if self.skip:
            return
        if not self.paragraph:
            self.paragraph = self.doc.add_paragraph()
        self.run = self.paragraph.add_run(data)
        # Apply styles from every currently-open <span>.
        spans = self.tags['span']
        for span in spans:
            if 'style' in span:
                style = self.parse_dict_string(span['style'])
                self.add_styles_to_run(style)
        # add font style
        for tag in self.tags:
            if tag in fonts:
                font_style = fonts[tag]
                setattr(self.run.font, font_style, True)

    def ignore_nested_tables(self, tables_soup):
        """Return only top-level tables from a bs4 ``find_all('table')`` result.

        Relies on bs4 returning child elements immediately after their parent
        in ``find_all``; if that ordering ever changes this must be updated.
        """
        new_tables = []
        nest = 0
        for table in tables_soup:
            if nest:
                nest -= 1
                continue
            new_tables.append(table)
            nest = len(table.find_all('table'))
        return new_tables

    def get_table_dimensions(self, table_soup):
        """Return (row_count, column_count); columns taken from the first row."""
        rows = table_soup.find_all('tr', recursive=False)
        cols = rows[0].find_all(['th', 'td'], recursive=False)
        return len(rows), len(cols)

    def get_tables(self):
        """Collect top-level tables from the soup (requires BeautifulSoup)."""
        if not hasattr(self, 'soup'):
            self.include_tables = False
            return
            # find other way to do it, or require this dependency?
        self.tables = self.ignore_nested_tables(self.soup.find_all('table'))
        self.table_no = 0

    def run_process(self, html):
        """Normalize the HTML (optionally via bs4) and feed it to the parser."""
        if self.bs and BeautifulSoup:
            self.soup = BeautifulSoup(html, 'html.parser')
            html = remove_whitespace(str(self.soup))
        else:
            html = remove_whitespace(html)
        if self.include_tables:
            self.get_tables()
        self.feed(html)

    def add_html_to_document(self, html, document):
        """Render *html* into an existing Document (or table cell)."""
        if not isinstance(html, str):
            raise ValueError('First argument needs to be a %s' % str)
        elif not isinstance(document,
                            docx.document.Document) and not isinstance(
                                document, docx.table._Cell):
            raise ValueError('Second argument needs to be a %s' %
                             docx.document.Document)
        self.set_initial_attrs(document)
        self.run_process(html)

    def add_html_to_cell(self, html, cell):
        """Render *html* into a table cell, keeping the cell docx-valid."""
        if not isinstance(cell, docx.table._Cell):
            raise ValueError('Second argument needs to be a %s' %
                             docx.table._Cell)
        unwanted_paragraph = cell.paragraphs[0]
        delete_paragraph(unwanted_paragraph)
        self.set_initial_attrs(cell)
        self.run_process(html)
        # cells must end with a paragraph or will get message about corrupt file
        # https://stackoverflow.com/a/29287121
        if not self.doc.paragraphs:
            self.doc.add_paragraph('')

    def parse_html_file(self, filename_html, filename_docx=None):
        """Convert an HTML file on disk to ``<filename_docx>.docx``."""
        with open(filename_html, 'r') as infile:
            html = infile.read()
        self.set_initial_attrs()
        self.run_process(html)
        if not filename_docx:
            path, filename = os.path.split(filename_html)
            filename_docx = '%s/new_docx_file_%s' % (path, filename)
        self.doc.save('%s.docx' % filename_docx)
letter = 'M' wordCount = 0 wordsPerFile = 20 fileCount = 1 docxName1 = 'test.docx' docxName2 = letter+str(fileCount)+'.docx' document1 = Document(docxName1) document2 = Document(docxName2) for paragraph in document1.paragraphs: if re.match('\d{4}', paragraph.text) is not None: wordCount += 1 if wordCount == 21: wordCount = 1 document2.save(docxName2) print('words in {}: 20'.format(docxName2)) fileCount += 1 docxName2 = letter + str(fileCount) + '.docx' document2 = Document(docxName2) paraInsert = document2.add_paragraph() fromParatoPara(paraInsert, paragraph) else: paraInsert = document2.add_paragraph() fromParatoPara(paraInsert, paragraph) document2.save(docxName2) print('words in {}: {}'.format(docxName2, wordCount)) print('finish, words count: {}'.format((fileCount-1) * wordsPerFile + wordCount))
def cargaexcel(request):
    """Django view: upload an Excel workbook and return it as a Word report.

    On POST, saves the uploaded file, parses every sheet after the first, maps
    each sheet's fixed cell layout into headings/tables of one Document, and
    streams it back as an attachment named Proyecto.docx. On GET, renders the
    upload form.
    """
    if request.method == 'POST':
        uploaded_file = request.FILES['document']
        fs = FileSystemStorage()
        name = fs.save(uploaded_file.name, uploaded_file)
        xls = pd.ExcelFile(BASE_DIR + fs.url(name))
        hojas = xls.sheet_names  # sheet names ("hojas" = sheets)
        print(len(hojas))
        dc1 = Document()

        def mat(abcd):
            # Compact the raw sheet array into a 15x5 grid of strings:
            # rows that contain at least one usable value are copied in
            # order, skipping empty cells, so fixed indices like fich[10][1]
            # line up regardless of blank spreadsheet rows.
            pri = 0
            val = 0
            aux = []
            for i in range(15):
                aux.append([])
                for j in range(5):
                    aux[i].append(None)
            for i in range(len(abcd)):
                sec = 0
                val = 0  # usable values found in this row
                for j in range(len(abcd[i])):
                    if type(abcd[i][j]) is str:
                        val += 1
                        aux[pri][sec] = abcd[i][j]
                        sec += 1
                    else:
                        if type(abcd[i][j]) is int:
                            abcd[i][j] = str(abcd[i][j])
                            val += 1
                            aux[pri][sec] = abcd[i][j]
                            sec += 1
                        else:
                            if type(abcd[i][j]) is float:
                                # NaN filter: NaN/NaN != 1 (NaN compares
                                # unequal to everything), so empty float
                                # cells are skipped here.
                                if abcd[i][j]/abcd[i][j]==1:
                                    abcd[i][j] = str(abcd[i][j])
                                    val += 1
                                    aux[pri][sec] = abcd[i][j]
                                    sec += 1
                # Only advance to the next output row if this row had data.
                if val > 0:
                    pri += 1
            return aux

        aux=0
        # Process every sheet except the first (index 0 is skipped by the
        # immediate cont += 1).
        for cont in range(len(hojas)-1):
            cont += 1
            df = xls.parse(hojas[cont])
            fich = df.__array__()
            real = mat(fich);
            fich = real
            print(fich)
            # Fixed layout of the normalized grid (assumed; confirm against
            # the spreadsheet template): row 0 title, rows 2-4 first block,
            # row 10 the three grades, row 11 the average.
            tit = fich[0][0]
            anio = str(fich[4][1])
            nota1 = str(fich[10][1])
            nota2 = str(fich[10][2])
            nota3 = str(fich[10][3])
            promedio = str(fich[11][1])
            dc1.add_heading(tit, 0)
            # First section: two-column label/value table.
            dc1.add_heading(fich[1][0], 2)
            p = dc1.add_paragraph()
            tbl1 = dc1.add_table(rows=0, cols=2)
            fila = tbl1.add_row().cells
            fila[0].text = fich[2][0]
            fila[1].text = fich[2][1]
            fila1 = tbl1.add_row().cells
            fila1[0].text = fich[3][0]
            fila1[1].text = fich[3][1]
            fila2 = tbl1.add_row().cells
            fila2[0].text = fich[4][0]
            fila2[1].text = anio
            # Second section.
            dc1.add_heading(fich[5][0], 2)
            p1 = dc1.add_paragraph()
            tbl2 = dc1.add_table(rows=0, cols=2)
            fila = tbl2.add_row().cells
            fila[0].text = fich[6][0]
            fila[1].text = fich[6][1]
            fila1 = tbl2.add_row().cells
            fila1[0].text = fich[7][0]
            fila1[1].text = fich[7][1]
            # Grades section: label row, the three grades, then the average.
            dc1.add_heading(fich[8][0], 2)
            p2 = dc1.add_paragraph()
            tbl3 = dc1.add_table(rows=0, cols=4)
            fila = tbl3.add_row().cells
            fila[0].text = fich[9][0]
            fila[1].text = fich[9][1]
            fila1 = tbl3.add_row().cells
            fila1[0].text = fich[10][0]
            fila1[1].text = nota1
            fila1[2].text = nota2
            fila1[3].text = nota3
            fila2 = tbl3.add_row().cells
            fila2[0].text = fich[11][0]
            fila2[1].text = promedio
            # Final section.
            dc1.add_heading(fich[12][0], 2)
            p3 = dc1.add_paragraph()
            tbl4 = dc1.add_table(rows=0, cols=2)
            fila = tbl4.add_row().cells
            fila[0].text = fich[13][0]
            fila[1].text = fich[13][1]
            fila1 = tbl4.add_row().cells
            fila1[0].text = fich[14][0]
            fila1[1].text = fich[14][1]
            """
            nombimg = 'img' + str(aux) + '.jpg'
            num = int(nota)
            nomb = str(fich[2][1])
            nombre = ("", nomb, "")
            posicion_y = np.arange(3)
            unidad = (0, num, 0)
            plt.barh(posicion_y, unidad, align="center")
            plt.yticks(posicion_y, nombre)
            plt.xlabel("NOTA")
            plt.title("NOTAS")
            plt.savefig(BASE_DIR+'/media/'+nombimg)
            dc1.add_picture(BASE_DIR+'/media/'+nombimg)
            os.remove(BASE_DIR+'/media/'+nombimg)
            """
            dc1.add_page_break()
        # Stream the document back as a download instead of writing to disk.
        nombre_archivo = "Proyecto.docx"
        response = HttpResponse(content_type="application/msword")
        contenido = "attachment; filename= {0}".format(nombre_archivo)
        response["Content-Disposition"] = contenido
        dc1.save(response)
        return response
    return render(request, 'word/index.html')
def Doc(Pic_Address, Company, Invoice_num, Invoice_Date, Company_Address, Customer_Name, Customer_Address, Email, Phone, Detail_top, Customer_Phone, Product_List, Extra_Info, Discount, Amount): document = Document("default.docx") section = document.sections for i in section: i.left_margin = Inches(0.75) i.right_margin = Inches(0.5) i.top_margin = Inches(0.4) i.bottom_margin = Inches(0.4) Pics(document, Pic_Address) para = document.add_paragraph("") Line1 = Line_sep(Var1=Company, Seperator=40, Var2="Invoice", Full=60, Var3=Invoice_num) Line(para, Line1, Size=24, Bold=True, NextL=False) Line2 = Line_sep(Var1="", Seperator=129, Var2="Invoice Date", Full=161, Var3=Invoice_Date) Line(para, Line2, Size=11) Line(para, Company_Address, Size=11) Line(para, Line=Detail_top, Size=11) Line(para, Email, Size=11) Line(para, Phone, Size=11) #Customer Line3 = Line_sep(Var1="Bill To:", Seperator=90, Var2="Ship To:", Full=None, Var3="") Line(para, Line3, Size=11, Bold=True) Line(para, Line=Customer_Name, Size=13, Bold=True) Line(para, Line=Customer_Address, Size=11) Line(para, Line=Customer_Phone, Size=11) #table table = Create_table(document) Assign_table(table, Product_List) #after table para1 = document.add_paragraph("") Line(para1, Line=Extra_Info, Size=11, NextL=False) Sub_Total = Amount Line4 = Line_sep(Var1="", Seperator=80, Var2="Sub Total ", Full="SPECIAL", Var3=str(Sub_Total)) Line(para1, Line=Line4, Bold=True, Size=18) Line5 = Line_sep(Var1="", Seperator=81, Var2="Discount ", Full="SPECIAL", Var3=Discount) Line(para1, Line=Line5, Bold=True, Size=18) Grand = float(Sub_Total) - float(Discount) Line6 = Line_sep(Var1="", Seperator=57, Var2="Grand Total ", Full="SPECIAL", Var3=str(Grand)) Line(para1, Line=Line6, Bold=True, Size=22) Line(para1, Line=" ", Size=12) line7 = "\n\nCustomer Signature Signature" Line(para1, Line=line7, Size=12) document.save("Invoice\Invoice " + Invoice_num + ".docx")
def readingFile(path1, path2, mode):
    """Compare two text files line by line and write a diff to output_result.docx.

    For each differing line the Word document gets a heading plus both lines,
    with mismatching words colored red; extra trailing lines of the longer
    file are appended at the end. Returns the match percentage.

    :param path1: first text file
    :param path2: second text file
    :param mode: 1 = case-insensitive comparison, anything else = exact case
    :return: percentage of lines that matched (100 - error ratio * 100).
        NOTE(review): divides by totalCompare -- two empty files would raise
        ZeroDivisionError; os.system("cls") is Windows-only.
    """
    totalCompare = 0
    errorCompare = 0
    document = Document()
    fil = open(path1, "r+")
    fil1 = open(path2, "r+")
    # First pass: count the lines of both files.
    countfil = 0
    countfil1 = 0
    print("calculating the length of file 1 : ")
    for i in fil:
        countfil += 1
    os.system("cls")
    print("calculating the length of file 2 : ")
    for i in fil1:
        countfil1 += 1
    count = 1
    # zip() below stops at the shorter file, so that is how many line pairs
    # get compared (fromLen); the longer file's length is the denominator.
    if (countfil > countfil1):
        totalCompare = countfil
        fromLen = countfil1
    else:
        totalCompare = countfil1
        fromLen = countfil
    # Re-open both files to rewind to the start for the comparison pass.
    fil.close()
    fil1.close()
    os.system("cls")
    print("starting to compare : ")
    fil = open(path1, "r+")
    fil1 = open(path2, "r+")
    # Walk both files in lockstep, one line pair at a time.
    for i, j in zip(fil, fil1):
        print("on line {} outOf {}".format(count, fromLen))
        string1 = i.strip()
        string2 = j.strip()
        # mode 1: case-insensitive comparison.
        if (mode == 1):
            string1 = string1.lower()
            string2 = string2.lower()
        else:
            pass
        # Split into words so mismatches can be highlighted individually.
        myList1 = list(string1.split())
        myList2 = list(string2.split())
        myList1Len = len(myList1)
        myList2Len = len(myList2)
        # Pad the shorter word list with " " so both can be indexed in step.
        if (myList1Len == myList2Len):
            pass
        else:
            if (myList1Len > myList2Len):
                elementsToBeInserted = myList1Len - myList2Len
                for i in range(elementsToBeInserted):
                    myList2.append(" ")
            else:
                elementsToBeInserted = myList2Len - myList1Len
                for i in range(elementsToBeInserted):
                    myList1.append(" ")
        # Equal word lists means the (possibly lowered) lines match.
        if (myList1 == myList2):
            pass
        else:
            # Record the differing pair: heading, then each line as a bullet
            # with the mismatching words in red.
            stringHeading = "Error on line " + str(count) + " : "
            document.add_heading(stringHeading, level=1)
            p = document.add_paragraph(style='List Bullet')
            # First bullet: the line from file 1.
            for index in range(len(myList1)):
                i = myList1[index]
                j = myList2[index]
                stringToadd = str(i) + " "
                if (i == j):
                    p.add_run(stringToadd)
                else:
                    # Each mismatching word is counted once per file, hence
                    # the later division of errorCompare by 2.
                    errorCompare = errorCompare + 1
                    run = p.add_run(stringToadd)
                    run.font.color.rgb = RGBColor(0xff, 0x00, 0x00)
            p = document.add_paragraph(style='List Bullet')
            # Second bullet: the line from file 2.
            for index in range(len(myList1)):
                i = myList1[index]
                j = myList2[index]
                stringToadd = str(j) + " "
                if (i == j):
                    p.add_run(stringToadd)
                else:
                    errorCompare = errorCompare + 1
                    run = p.add_run(stringToadd)
                    run.font.color.rgb = RGBColor(0xff, 0x00, 0x00)
        count += 1
    # Mismatches were double-counted (once per file side).
    errorCompare = errorCompare / 2
    trackCount = count - 1
    count = 0
    # Re-open again for the trailing-lines pass.
    fil.close()
    fil1.close()
    os.system("cls")
    print("outputting extra lines founded : ")
    fil = open(path1, "r+")
    fil1 = open(path2, "r+")
    # Dump whichever file is longer: lines beyond the compared range are
    # appended to the document and counted as errors.
    if (countfil > countfil1):
        stringHeading = "these are the extra lines after the " + str(
            countfil1) + " in txt file no1 : "
        document.add_heading(stringHeading, level=1)
        p = document.add_paragraph()
        for i in fil:
            # Only lines past the compared prefix are emitted.
            if (trackCount <= count):
                string1 = i.strip()
                p.add_run(string1)
                p.add_run("\n")
                errorCompare = errorCompare + 1
            count += 1
    elif (countfil < countfil1):
        stringHeading = "these are the extra lines after the " + str(
            countfil) + " in txt file no2 : "
        document.add_heading(stringHeading, level=1)
        p = document.add_paragraph()
        for i in fil1:
            if (trackCount <= count):
                string1 = i.strip()
                p.add_run(string1)
                p.add_run("\n")
                errorCompare = errorCompare + 1
            count += 1
    # Persist the report and compute the match percentage.
    document.save('output_result.docx')
    fil.close()
    fil1.close()
    percentageMatched = 100 - ((errorCompare / totalCompare) * 100)
    return percentageMatched
from docx import Document from docx.shared import Inches document = Document() #创建文档 document.add_heading('Document Title', 0) #大标题等级为0 p = document.add_paragraph('A plain paragraph having some') #添加文本段落 p.add_run('bold').bold = True #添加文本为粗体 p.add_run(' and some ') p.add_run('italic.').italic = True #添加文本为斜体 document.add_heading('Heading, level 1', level=1) document.add_paragraph('Intense quote', style='IntenseQuote') #添加文本段落样式为Style document.add_paragraph('first item in unordered list', style='ListBullet') document.add_paragraph('fisrt item in ordered list', style='ListNumber') document.add_picture('python_logo.gif', width=Inches(1.25)) #添加图片宽度为1.25英寸 table = document.add_table(rows=1, cols=3) #创建了一行三列的表格 hdr_cells = table.rows[0].cells #取得这个表格的第一行 hdr_cells[0].text = 'Qty' #为列名赋值 hdr_cells[1].text = 'Id' hdr_cells[2].text = 'Desc' for item in range(3): row_cells = table.add_row().cells #添加新行 row_cells[0].text = str(item) #为每列赋值 row_cells[1].text = str(item * 100) row_cells[2].text = str(item) + '10' document.add_page_break() #添加分页符到文档末尾 document.save('hello.docx') #保存文档
#extract the variables name = values["name"] address = values["address"] salutation = values["salutation"] bodytext = values["bodytext"] signature = values["signature"] now = datetime.utcnow() todays_date = now.strftime('%Y-%m-%d') #prepare the letter try: document = Document('letterhead.docx') except: document = Document() p = document.add_paragraph(todays_date) p = document.add_paragraph(" ") p = document.add_paragraph(" ") p = document.add_paragraph(name + '\n' + address ) p = document.add_paragraph(" ") p = document.add_paragraph(" ") p = document.add_paragraph(salutation + " " + name + ",") p = document.add_paragraph(" ") p = document.add_paragraph(bodytext) p = document.add_paragraph(" ") p = document.add_paragraph(signature) document.save(name + " - " + todays_date +".docx") window['outputline'].update('Created ' + name + ' ' + todays_date) except Exception as e:
def saveToDocx(data, fileName): document = Document() document.add_heading("Email: " + data["email"]) document.add_paragraph("UUID: " + data["uuid"]) document.add_heading("Page Url:" + data["url"], level=2) document.add_heading("Title", level=3) document.add_paragraph(data["title"][0]) document.add_heading("HyperLink's", level=3) for item in data["link"]: document.add_paragraph(item) document.add_heading("Span's", level=3) for item in data["span"]: document.add_paragraph(item) document.add_heading("Paragraph's", level=3) for item in data["paragraph"]: document.add_paragraph(item) document.add_heading("Heading's 1", level=3) for item in data["heading 1"]: document.add_paragraph(item) document.add_heading("Heading's 2", level=3) for item in data["heading 2"]: document.add_paragraph(item) document.add_heading("Heading's 3", level=3) for item in data["heading 3"]: document.add_paragraph(item) document.add_heading("Heading's 4", level=3) for item in data["heading 4"]: document.add_paragraph(item) document.add_heading("Other's", level=3) for item in data["other"]: document.add_paragraph(item) document.save(fileName + ".docx")
class Transformer: def __init__(self, filename='no_file', count=[10]): self.document = Document() self.answer = Document() self.filename = filename self.count = count style = self.document.styles['Normal'] font = style.font font.name = 'Times New Roman' font.size = Pt(12) paragraph_format = style.paragraph_format paragraph_format.space_before = 0 paragraph_format.space_after = 0 paragraph_format.line_spacing = 1.15 def initiate(self): xl = xlrd.open_workbook(self.filename) sheetnames = xl.sheet_names() for idx, sheet_name in enumerate(xl.sheets()): total_rows = sheet_name.nrows total_required = self.count[idx] if(total_required >= total_rows): n = range(1,total_rows) else: n = random.sample(range(1, total_rows), total_required) p = self.document.add_paragraph() p.add_run(sheetnames[idx]).bold = True pa = self.answer.add_paragraph() pa.add_run(sheetnames[idx]).bold = True for i in n: q = self.document.add_paragraph(style='ListNumber') q.add_run(sheet_name.cell(i, 1).value).bold = True self.document.add_paragraph('a) '+str(sheet_name.cell(i, 2).value), style='List') self.document.add_paragraph('b) '+str(sheet_name.cell(i, 3).value), style='List') self.document.add_paragraph('c) '+str(sheet_name.cell(i, 4).value), style='List') self.document.add_paragraph('d) '+str(sheet_name.cell(i, 5).value), style='List') self.document.add_paragraph( style='List') qa = self.answer.add_paragraph(style='ListNumber') qa.add_run(sheet_name.cell(i, 1).value).bold = True self.answer.add_paragraph('a) '+str(sheet_name.cell(i, 2).value), style='List') self.answer.add_paragraph('b) '+str(sheet_name.cell(i, 3).value), style='List') self.answer.add_paragraph('c) '+str(sheet_name.cell(i, 4).value), style='List') self.answer.add_paragraph('d) '+str(sheet_name.cell(i, 5).value), style='List') aa = self.answer.add_paragraph() aa.add_run("Answer: "+sheet_name.cell(i, 6).value.lower()).bold = True self.answer.add_paragraph( style='List') return self.document, self.answer
def create(self, data=None):
    """Generate one .docx resume per scraped LinkedIn profile.

    data -- list of profile dicts with keys 'personal_info',
            'experiences' and 'skills'.  Defaults to an empty list
            (the original mutable default ``data=[]`` was replaced
            with the ``None`` sentinel idiom).

    Each file is saved under C:/LinkedinDocs, named after the person
    with non-letters stripped.
    """
    if data is None:
        data = []
    # makedirs(exist_ok=True) avoids the check-then-create race of the
    # previous os.path.exists() + os.mkdir() pair.
    os.makedirs('C:/LinkedinDocs', exist_ok=True)
    for user in data:
        document = Document()
        # Fall back to {} / [] rather than strings: the original code
        # used "Error 1"-style string defaults, on which the chained
        # .get() call would raise AttributeError (and iterating
        # "Error 4" would yield single characters).
        personal = user.get('personal_info', {})
        document.add_heading(personal.get('name', "Error 2"))
        document.add_heading(personal.get('headline', "Error 4"), level=2)
        phone = personal.get('phone', 'Phone error')
        email = personal.get('email', 'email Error')
        table = document.add_table(rows=1, cols=2)
        table.cell(0, 0).text = str(phone)
        table.cell(0, 1).text = str(email)
        document.add_paragraph(personal.get('summary', "Error 5"))
        document.add_heading("Experience", level=2)
        experiences = user.get('experiences', {})
        for job in experiences.get('jobs', []):
            document.add_paragraph(job.get('title'), style='List Bullet')
            document.add_paragraph(job.get('date_range'))
            document.add_paragraph(
                str(job.get('description')).replace("\n", "").strip())
        document.add_heading("Skills", level=2)
        for skill in user.get('skills', []):
            document.add_paragraph(skill.get('name'), style='List Bullet')
        document.add_heading("Education", level=2)
        for edu in experiences.get('education', []):
            document.add_paragraph(
                str(edu.get('field_of_study')) + '\t' + str(edu.get('date_range')),
                style='List Bullet')
            document.add_paragraph(edu.get('degree', ''))
            document.add_paragraph(edu.get('name', "None"))
        # File name: keep letters only, collapse repeated spaces.
        name = str(personal.get('name', "Error 2"))
        result = re.sub(r'[^a-zA-Z]', " ", name)
        result = re.sub(' +', ' ', result)
        document.save('C:/LinkedinDocs/' + result.strip() + '.docx')
# Start from a fresh, empty document.
document = Document()

# Headings.
# Body text in anything but the shortest document is split into
# sections, each introduced by a heading; add_heading() defaults to
# level=1, pass level= for sub-headings.
document.add_heading('The REAL meaning of the universe')
document.add_heading('The role of dolphins', level=2)

# Paragraphs.
# Text passed directly to add_paragraph() ends up in a single run.
paragraph = document.add_paragraph('Lorem ipsum dolor sit amet.')

# More runs can be appended to an existing paragraph afterwards.
paragraph = document.add_paragraph('Add more text ')
paragraph.add_run('by .add_run() method.')

# Character formatting (bold / italic) is applied per run.
paragraph = document.add_paragraph('Applying ')
paragraph.add_run('bold ').bold = True
paragraph.add_run('and ')
paragraph.add_run('italic.').italic = True

# Applying a character style
# Page geometry: 6.25" x 9.25" trim with 1" side margins.
section.left_margin = Inches(1)
section.right_margin = Inches(1)
section.page_width = Inches(6.25)
section.page_height = Inches(9.25)

alphabet = list("abcdefghijklmnopqrstuvwxyz .,")
n_chars = len(alphabet)
width = 56
odd = True
for page_index in range(410):
    num = str(page_index + 1)
    print('Page ' + num)
    # 46 lines of `width` pseudo-random characters per page.
    for _ in range(46):
        line = ''.join(alphabet[int(random() * n_chars)] for _ in range(width))
        doc.add_paragraph(line)
    doc.add_paragraph(' ')
    doc.add_paragraph(' ')
    # Page number: flush left on odd pages, flush right on even pages.
    if odd:
        doc.add_paragraph(num)
    else:
        doc.add_paragraph(num.rjust(width))
    odd = not odd
doc.save('out/book.docx')

# Generate hard cover
from PIL import Image, ImageFont, ImageDraw

size = (2956, 2100)
def CreateWordListDocument(
    inputFile,
    outputFile=None,
):
    """Generate a vocabulary .docx (word table + example sentences).

    inputFile  -- path to a text file containing the vocabulary; parsed
                  by the project helper generateWordList().
    outputFile -- destination .docx path; defaults to <inputFile
                  basename>.docx in the same folder.

    Returns the number of words processed, or None when the input file
    yields no vocabulary.
    """
    basename = ""
    (folder_path, shortname) = os.path.split(inputFile)
    (basename, extension) = os.path.splitext(shortname)
    if (outputFile is None):
        filename_docx_string = basename + r'.docx'
        outputFile = os.path.join(folder_path, filename_docx_string)
    #Process the text data, split the English and Chinese sentences/vocabulary
    word_list = generateWordList(inputFile)
    num = len(word_list)
    if (num == 0):
        print(
            f"\n[Error]: There is no vocabulary in the text file {inputFile}, abort! \n"
        )
        return None
    # Document/section titles derived from the input file name.
    title = basename + " " + "word list"
    title = title.title()
    sent_title = basename + " " + "sentences examples"
    sent_title = sent_title.title()
    # Collected as the word table is built; rendered at the end.
    sentences_list = []
    Page_topMargin = Cm(1.27)
    Page_bottomMargin = Cm(1.27)
    Page_leftMargin = Cm(1.27)
    Page_rightMargin = Cm(1.27)
    #For A4 page orientation Landscape, if portrait just swap height and width
    A4_Page_Height = Cm(29.7)
    A4_Page_Width = Cm(21)
    #29.7-1.27*2 = 27.16
    #6.5*2+7*2 = 27
    Word_column_width = Cm(3.5)
    Phonetic_column_width = Cm(3.5)
    POS_column_width = Cm(2)
    # 21-1.27*2 -9 =
    Acceptation_column_width = Cm(9.4)
    #Init the document
    document = Document()
    #init the font style
    style = document.styles['Normal']
    style.font.name = 'Tahoma'
    # East-Asian font must be set through the low-level rPr element.
    style.element.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑')
    style.font.size = Pt(12)
    #changing the page margins
    sections = document.sections
    for section in sections:
        section.orientation = WD_ORIENT.PORTRAIT  # set the page landscape or portrait
        if (section.orientation == WD_ORIENT.LANDSCAPE):
            section.page_width = A4_Page_Height
            section.page_height = A4_Page_Width
        else:
            section.page_width = A4_Page_Width
            section.page_height = A4_Page_Height
        section.top_margin = Page_topMargin
        section.bottom_margin = Page_bottomMargin
        section.left_margin = Page_leftMargin
        section.right_margin = Page_rightMargin
    #Add The title
    p = document.add_heading(title, 0)
    #Add the Table
    table = document.add_table(1, 4)
    table.alignment = WD_TABLE_ALIGNMENT.CENTER
    table.style = 'Table Grid'
    table.autofit = False
    table.allow_autofit = False
    HEADER_ROW = 0
    hdr_cells = table.rows[HEADER_ROW].cells
    #add table header
    hdr_cells[0].text = "Word"
    hdr_cells[0].width = Word_column_width
    hdr_cells[1].text = "Phonetic"
    hdr_cells[1].width = Phonetic_column_width
    hdr_cells[2].text = "POS"
    hdr_cells[2].width = POS_column_width
    hdr_cells[3].text = "Acceptation"
    hdr_cells[3].width = Acceptation_column_width
    # Bold the header row (formatting is applied to the cell's run).
    for cell in hdr_cells:
        run = cell.paragraphs[0].runs[0]
        run.font.bold = True
        run.font.size = Pt(12)
    for word_i in tqdm(word_list,
                       desc="Lookup words",
                       total=len(word_list),
                       unit="words"):
        # lookupword() is a project helper; returns None on lookup failure.
        lookup_result = lookupword(word_i)
        if lookup_result is not None:
            sentences_list.append({
                "word": lookup_result['word'],
                "sent": lookup_result['sent']
            })
            #first insert necessary rows
            # One table row per part-of-speech; word/phonetic cells are
            # merged vertically across those rows below.
            pos_num = len(lookup_result['pos'])
            for ss in range(pos_num):
                table.add_row()
                current_row = len(table.rows) - 1
                current_cells = table.rows[current_row].cells
                current_cells[0].text = lookup_result['word']
                current_cells[0].vertical_alignment = WD_ALIGN_VERTICAL.CENTER
                current_cells[0].width = Word_column_width
                current_cells[1].text = lookup_result['phonetic']
                current_cells[1].vertical_alignment = WD_ALIGN_VERTICAL.CENTER
                current_cells[1].width = Phonetic_column_width
                current_cells[2].text = lookup_result['pos'][ss]
                current_cells[2].vertical_alignment = WD_ALIGN_VERTICAL.CENTER
                current_cells[2].width = POS_column_width
                current_cells[3].text = lookup_result['acceptation'][ss]
                current_cells[3].vertical_alignment = WD_ALIGN_VERTICAL.CENTER
                current_cells[3].width = Acceptation_column_width
                if (ss >= 1):
                    for col in range(2):  # first 2 columns merged seperately
                        up_cell = table.cell(current_row - 1, col)
                        down_cell = table.cell(current_row, col)
                        down_cell.merge(up_cell)
                    # Re-set text/format after the merge so the merged
                    # cell shows a single copy of the word/phonetic.
                    table.cell(current_row, 0).text = lookup_result['word']
                    table.cell(current_row,
                               0).vertical_alignment = WD_ALIGN_VERTICAL.CENTER
                    table.cell(current_row, 0).width = Word_column_width
                    table.cell(current_row, 1).text = lookup_result['phonetic']
                    table.cell(current_row,
                               1).vertical_alignment = WD_ALIGN_VERTICAL.CENTER
                    table.cell(current_row, 1).width = Phonetic_column_width
    # Column-level widths reinforce the per-cell widths set above.
    columns = table.columns
    columns[0].width = Word_column_width
    columns[1].width = Phonetic_column_width
    columns[2].width = POS_column_width
    columns[3].width = Acceptation_column_width
    #Add example sentences
    if len(sentences_list):
        #Add The title
        p = document.add_heading("\n\n" + sent_title, 0)
        for index, result in enumerate(sentences_list):
            word = result['word']
            sentences = result['sent']
            if (sentences is not None and len(sentences)):
                p = document.add_heading(f"{index+1:>3}. " + word, 1)
                for s_i, sent in enumerate(sentences):
                    English = " ".join(sent.orig.string.splitlines())
                    Chinese = "".join(sent.trans.string.splitlines())
                    Example = f"({(s_i+1):>2} ). " + English + "\n"
                    paragraph = document.add_paragraph(Example,
                                                      style='List')  # add the English sentence
                    paragraph.add_run(Chinese)
                    paragraph.paragraph_format.space_before = Pt(10)
    document.save(outputFile)
    print(
        f"\nVocabulary translation document {outputFile} generated successfully!\n"
    )
    return num