def fasterSolution(self, b):
    """Fill a 1000x4 table with the same picture in every cell and save it.

    The picture path is built from ``self.folderAddress`` and ``self.b`` plus
    a ``.png`` suffix; the result is saved under the ``singleFolder``
    sub-folder as ``<self.b>_1.docx``. The progress indicator is set to the
    elapsed time since ``self.start_time`` before building, after each
    inserted picture, and after saving.

    Note: the ``b`` argument is not read here; the name comes from ``self.b``.
    """

    def _refresh_progress():
        # Show elapsed seconds and let the widget redraw.
        self.progress['value'] = time.time() - self.start_time
        self.progress.update()

    _refresh_progress()

    self.document = Document()
    self.COLUMNS = 4
    self.ROWS = 1000
    self.table = self.document.add_table(rows=self.ROWS, cols=self.COLUMNS)
    # The cell sequence is read once and sliced per row below.
    self.table_cells = self.table._cells

    for row_index in range(self.ROWS):
        first = row_index * self.COLUMNS
        self.row_cells = self.table_cells[first:first + self.COLUMNS]
        for current_cell in self.row_cells:
            self.paragraph = current_cell.paragraphs[0]
            self.run = self.paragraph.add_run()
            self.run.add_picture(
                "\\".join([self.folderAddress, self.b + ".png"]),
                width=350000 * 0.71,
                height=350000 * 0.49,
            )
            # NOTE(review): per-picture refresh assumed; the flattened source
            # does not show whether this sat in the inner or outer loop.
            _refresh_progress()

    self.document.save(
        "\\".join([self.folderAddress, "singleFolder", self.b + "_" + str(1) + ".docx"])
    )
    _refresh_progress()
def it_has_part_as_header_part(self):
    """Smoke-test: add a header holding a floating picture, then save.

    The document is opened from ``dir_pkg_path``, a header with one paragraph
    and one run is added, and ``python_powered_path`` is inserted as a
    floating picture. The document is then serialized to memory.
    """
    import io

    document = Document(dir_pkg_path)
    header = document.add_header()
    paragraph = header.add_paragraph()
    run = paragraph.add_run()
    run.add_floating_picture(python_powered_path)

    # Save to an in-memory stream rather than a developer-specific absolute
    # path, so the test does not depend on one machine's home directory.
    stream = io.BytesIO()
    document.save(stream)
    assert stream.getvalue()
def it_removes_header_part(self):
    """After ``remove_headers()`` no header part or header reference is left."""
    document = Document(dir_pkg_path)
    document.remove_headers()

    # No related part may still carry the header content type.
    for related_part in document.part.related_parts.values():
        assert related_part.content_type != CT.WML_HEADER

    # The section properties must hold no w:headerReference children.
    sectPr = document._body._body.get_or_add_sectPr()
    remaining_refs = sectPr.findall(qn('w:headerReference'))
    assert len(remaining_refs) == 0
def it_removes_footer_part(self):
    """After ``remove_footers()`` no footer part or footer reference is left."""
    document = Document(dir_pkg_path)
    document.remove_footers()

    # No related part may still carry the footer content type.
    for related_part in document.part.related_parts.values():
        assert related_part.content_type != CT.WML_FOOTER

    # The section properties must hold no w:footerReference children.
    sectPr = document._body._body.get_or_add_sectPr()
    remaining_refs = sectPr.findall(qn("w:footerReference"))
    assert len(remaining_refs) == 0
def fasterSolution(folderAddress, pic):
    """Build a 1000x4 table document with the same picture in every cell.

    Args:
        folderAddress: Prefix of the picture path. It is concatenated directly
            with ``pic``, so it must already end with a path separator.
        pic: Picture file name without the ``.png`` extension.

    Returns:
        The populated ``Document``. The original built the document and then
        discarded it; returning it lets the caller save it.
    """
    COLUMNS = 4
    ROWS = 1000  # was referenced but never defined (NameError)

    document = Document()
    # add_table takes ``cols``, not ``columns``.
    table = document.add_table(rows=ROWS, cols=COLUMNS)
    # The cell sequence is read once and sliced per row below.
    table_cells = table._cells
    picture_path = folderAddress + pic + ".png"

    for i in range(ROWS):
        # The slice is already the row's cells; a list has no ``.cells``.
        for cell in table_cells[i * COLUMNS:(i + 1) * COLUMNS]:
            run = cell.paragraphs[0].add_run()
            run.add_picture(
                picture_path, width=350000 * 1.42, height=700000 * 0.49
            )
    return document
def finalizer(filed=None):
    """Merge a fixed list of .docx files, plus extras, into one report.

    Each input is opened, a page break is added to the output, and every
    paragraph is passed to ``get_para_data`` together with the output document
    and the paragraph's index. A closing picture is appended and the result
    is saved to ``static/merge/report.docx``.

    Args:
        filed: Optional iterable of extra .docx paths. Paths not already in
            the built-in list are appended in the order given.
    """
    files = [
        "static/merge/planning and organising.docx",
        "static/merge/customer service.docx",
        "static/merge/leadership.docx",
        "static/merge/commercial awareness.docx",
        "static/merge/initiative.docx",
        "static/merge/persuasive oral communication.docx",
    ]
    output_doc = Document("static/merge/nicopon.docx")

    for extra in filed or ():
        if extra not in files:
            files.append(extra)

    for file in files:
        # Best-effort merge: an unreadable input is reported and skipped so
        # the remaining files still end up in the report.
        try:
            input_doc = Document(file)
            # The original guarded this with ``index < len(files)``, which is
            # always true, so the page break is unconditional.
            output_doc.add_page_break()
            for paro, para in enumerate(input_doc.paragraphs):
                get_para_data(output_doc, para, paro)
        except Exception as e:
            print("mhhh")
            print(e)

    p = output_doc.add_paragraph()
    r = p.add_run()
    r.add_picture('static/end.PNG', width=Inches(7.09), height=Inches(8.76))
    output_doc.save('static/merge/report.docx')
def it_adds_to_doc_without_header(self):
    """``add_header()`` creates one header reference and an empty header."""
    document = Document(dir_pkg_path)
    header = document.add_header()

    # Exactly one w:headerReference must now be present.
    sectPr = document._body._body.get_or_add_sectPr()
    header_refs = sectPr.findall(qn('w:headerReference'))
    assert len(header_refs) == 1

    # The new header starts empty and accepts a paragraph.
    assert header
    assert len(header.paragraphs) == 0
    header.add_paragraph('foobar')
    assert len(header.paragraphs) == 1
def putIntoDocumentFiles(folderAddress, pic):
    """Fill a 100x4 table with one picture per cell and save it as .docx.

    The picture path is built from ``folderAddress`` and ``pic`` plus a
    ``.png`` suffix; the output goes to the ``singleFolder`` sub-folder as
    ``<pic>_1.docx``.
    """
    document = Document()
    # ROWS=25 for 100 barcodes
    table = document.add_table(rows=100, cols=4)

    for table_row in table.rows:
        for table_cell in table_row.cells:
            picture_run = table_cell.paragraphs[0].add_run()
            # (width, height) => dimensions (singleTableRow, singleTableColumn)
            # for a single page
            picture_run.add_picture(
                "\\".join([folderAddress, pic + ".png"]),
                width=350000 * 0.71,
                height=350000 * 0.49,
            )

    document.save("\\".join([folderAddress, "singleFolder", pic + "_" + "1" + ".docx"]))
def it_adds_to_doc_without_footer(self):
    """``add_footer()`` on a footer-less document adds one reference."""
    document = Document(dir_pkg_path)
    document.remove_footers()
    footer = document.add_footer()

    # Exactly one w:footerReference must now be present.
    sectPr = document._body._body.get_or_add_sectPr()
    footer_refs = sectPr.findall(qn('w:footerReference'))
    assert len(footer_refs) == 1

    # The new footer starts empty and accepts a paragraph.
    assert footer
    assert len(footer.paragraphs) == 0
    footer.add_paragraph('foobar')
    assert len(footer.paragraphs) == 1
def it_adds_to_doc_without_footer(self):
    """``add_footer()`` on a footer-less document adds one reference."""
    document = Document(dir_pkg_path)
    document.remove_footers()
    footer = document.add_footer()

    # Exactly one w:footerReference must now be present.
    sectPr = document._body._body.get_or_add_sectPr()
    footer_refs = sectPr.findall(qn("w:footerReference"))
    assert len(footer_refs) == 1

    # The new footer starts empty and accepts a paragraph.
    assert footer
    assert len(footer.paragraphs) == 0
    footer.add_paragraph("foobar")
    assert len(footer.paragraphs) == 1
def add_picture_fixture(self, request, run_, picture_):
    """Provide a document, mocked image path, size, and mocked run/picture.

    ``docx.text.Run`` is patched to return ``run_``, whose ``add_picture``
    returns ``picture_``.
    """
    # The document is created before the Run class is patched, as before.
    document = Document()
    image_path_ = instance_mock(request, str, name='image_path_')
    dimensions = (100, 200)  # (width, height)

    class_mock(request, 'docx.text.Run', return_value=run_)
    run_.add_picture.return_value = picture_

    return (document, image_path_, *dimensions, run_, picture_)
def get_grouplist(self, filename):
    """Return the 'Фамилия, имя, отчество ' column of the file's first table.

    The first table row supplies the column headers. Each later row is mapped
    header -> cell text and the value under that header is collected, or
    ``None`` when the header is absent.
    """
    table = Document(filename).tables[0]

    rows = iter(table.rows)
    header_row = next(rows, None)
    if header_row is None:
        # No rows at all: nothing to collect.
        return []

    # Headers of the first row become the dictionary keys.
    keys = tuple(cell.text for cell in header_row.cells)

    return [
        dict(zip(keys, (cell.text for cell in row.cells))).get(
            'Фамилия, имя, отчество '
        )
        for row in rows
    ]
def __init__(self, filename):
    """Open ``filename`` as a document and start with an empty ``outlist``."""
    self.filename = filename
    self.document = Document(self.filename)
    self.outlist = list()
def read_table(docv):
    """Read the second table as a list of row dicts and store header info.

    Rows of ``tables[1]`` are keyed by their cell count (4, 3 or 2 cells); a
    row with any other cell count reuses the keys of the previous row. Four
    cells of ``tables[0]`` are passed on: the first is given to ``namer`` and
    all four are stored in ``session["inf"]``.

    Returns:
        list[dict]: one dict per row of the second table.
    """
    document = Document(docv)
    table = document.tables[1]
    table_info = document.tables[0]

    keys_by_width = {
        4: ("number", "title", "comments", "no"),
        3: ("number", "title", "comments"),
        2: ("title", "comments"),
    }

    info = [[cell.text for cell in info_row.cells] for info_row in table_info.rows]

    data = []
    keys = None
    for table_row in table.rows:
        cell_texts = [cell.text for cell in table_row.cells]
        # An unknown width keeps whatever keys the previous row used.
        keys = keys_by_width.get(len(cell_texts), keys)
        data.append(dict(zip(keys, cell_texts)))

    info = [info[0][1], info[0][3], info[1][1], info[1][3]]
    namer(info[0])
    # info: candidate, exercise, assessor, date - kept in the session
    session["inf"] = info
    return data
def parse():
    """Extract "label: value" text from the first table of each .docx file.

    The directory comes from ``ent_path.get()``. For each .docx file with at
    least one table, one row of values is collected. A paragraph ending in
    ``:`` opens an empty value that following colon-free paragraphs extend; a
    paragraph containing ``:`` contributes the text after its first colon.
    The rows are written to ``output.xlsx`` with columns taken from the
    module-level ``keys``.
    """
    data = []
    dirpath = ent_path.get()
    # List the chosen directory. The original called os.listdir() with no
    # argument, which lists the current working directory instead.
    for path in os.listdir(dirpath):
        full_path = os.path.join(dirpath, path)
        if not (os.path.isfile(full_path) and path.endswith(".docx")):
            continue

        document = Document(full_path)
        if not document.tables:
            continue

        table = document.tables[0]
        data.append([])
        flag = False  # True while a "label:" paragraph awaits its value
        for row in table.rows:
            for cell in iter_unique_cells(row):
                for para in cell.paragraphs:
                    text = para.text.strip()
                    if not text:
                        continue
                    if text[-1] == ':':
                        data[-1].append('')
                        flag = True
                        continue
                    texts = text.split(':')
                    if len(texts) == 1 and flag:
                        data[-1][-1] += text
                    elif len(texts) > 1:
                        data[-1].append(texts[1])
                        # NOTE(review): the flattened source does not show
                        # whether this reset belonged to this branch only;
                        # confirm against the original file.
                        flag = False

    df = pd.DataFrame(data, columns=keys)
    df.to_excel("output.xlsx", index=False)
def it_has_rel_as_footer_rel(self):
    """The document part has at least one relationship of type FOOTER."""
    document = Document(dir_pkg_path)
    relationships = document.part.rels.values()
    assert any(rel.reltype == RT.FOOTER for rel in relationships)
def it_has_rel_as_header_rel(self):
    """The document part has at least one relationship of type HEADER."""
    document = Document(dir_pkg_path)
    relationships = document.part.rels.values()
    assert any(rel.reltype == RT.HEADER for rel in relationships)
def readDocxParagraph(name):
    """Return all paragraph text of a .docx file as one lower-cased string.

    Each paragraph's text is followed by the marker ``" endpara "``.

    Args:
        name: Path or file-like object passed to ``Document``.

    Returns:
        str: the concatenated, lower-cased text; ``''`` with no paragraphs.
    """
    document = Document(name)
    # Join once instead of repeated ``+=``, and avoid shadowing builtin ``str``.
    joined = "".join(para.text + u" endpara " for para in document.paragraphs)
    return joined.lower()
def add_picture_fixture(self, request, Document_inline_shapes_, inline_shapes_):
    """Provide a document, requested and expected sizes, and a mocked picture.

    ``request.param`` supplies ``(width, height, expected_width,
    expected_height)``. The picture returned by ``inline_shapes_.add_picture``
    is given a width of 200 and a height of 100.
    """
    width, height, expected_width, expected_height = request.param
    document = Document()
    image_path_ = instance_mock(request, str, name='image_path_')

    picture_ = inline_shapes_.add_picture.return_value
    picture_.width = 200
    picture_.height = 100

    return (
        document,
        image_path_,
        width,
        height,
        inline_shapes_,
        expected_width,
        expected_height,
        picture_,
    )
def file_tables(f, filename):
    """Return the tables of the document opened from ``f`` as a new list.

    ``filename`` is accepted but not used here.
    """
    document = Document(f)  # currently only supports docx files
    return list(document.tables)
def it_has_part_as_header_part(self):
    """The document has a header part, and every header part is an XmlPart."""
    document = Document(dir_pkg_path)

    header_parts = [
        related_part
        for related_part in document.part.related_parts.values()
        if related_part.content_type == CT.WML_HEADER
    ]

    for header_part in header_parts:
        assert isinstance(header_part, XmlPart)
    assert header_parts
def convert_table_to_df(document_name, table_nos=None):
    """Convert selected tables of a .docx document with ``table_to_df``.

    Args:
        document_name: Path or file-like object passed to ``Document``.
        table_nos: Indexes of the tables to convert. ``None`` (the default)
            or ``''`` selects every table. An explicitly empty list still
            selects nothing.

    Returns:
        list: one ``table_to_df`` result per selected table, in order.
    """
    document = Document(document_name)
    # The old default was ``[]`` while the "all tables" check was ``== ''``,
    # so calling without ``table_nos`` silently converted nothing.
    if table_nos is None or table_nos == '':
        table_nos = range(len(document.tables))
    return [table_to_df(document.tables[index]) for index in table_nos]
def __init__(self, context, request=None):
    """Load the docx template and set the attributes used to compile it.

    The template is read from ``self._template_filename``.
    """
    self.context, self.request = context, request
    self.template = Document(self._template_filename)
    self.use_existing_measures = False
    self.tool_type = get_tool_type(self.context)
    self.tti = getUtility(IToolTypesInfo)
    self.italy_special = False
def readDocxTable(name):
    """Return all table-cell text of a .docx file as one lower-cased string.

    Every paragraph in every cell of every table is visited in document
    order; each paragraph's text is followed by the marker ``" endpara "``.

    Args:
        name: Path or file-like object passed to ``Document``.

    Returns:
        str: the concatenated, lower-cased text; ``''`` with no tables.
    """
    document = Document(name)
    # Join once instead of repeated ``+=``, and avoid shadowing builtin ``str``.
    joined = "".join(
        paragraph.text + u" endpara "
        for table in document.tables
        for row in table.rows
        for cell in row.cells
        for paragraph in cell.paragraphs
    )
    return joined.lower()
def import_data(filename):
    """Load tab-separated paragraphs of a .docx file into ``input.db``.

    The ``input`` table is created if missing and filled only when empty.
    Paragraph 0 is skipped (presumably a header line - confirm). Each later
    paragraph is split on runs of tabs and its first six fields are inserted
    as one row, committed immediately. Import stops at the first empty
    paragraph or at the end of the document. Nothing is imported when
    paragraph 0 itself is empty.

    Args:
        filename: Path or file-like object passed to ``Document``.

    Raises:
        IndexError: if a non-empty paragraph has fewer than six fields.
    """
    conn = lite.connect("input.db")
    try:
        cur = conn.cursor()
        cur.execute(
            "create table IF NOT EXISTS input('id' integer, 'description' text, 'datetime' integer, 'longitute' real, 'latitude' real, 'elevation' integer)"
        )
        cur.execute("select count(*) from input")
        if cur.fetchone()[0] != 0:
            return

        # Parse the file once; the original re-opened it for every row.
        paragraphs = Document(filename).paragraphs
        if not paragraphs or not paragraphs[0].text:
            return

        for paragraph in paragraphs[1:]:
            # Test the current paragraph before using it. The original tested
            # the previous one, so it always ran into the terminating empty
            # paragraph (or past the end) and raised IndexError.
            if not paragraph.text:
                break
            col = re.split(r'\t+', paragraph.text)
            cur.execute(
                "insert into input values(?,?,datetime(?),?,?,?)",
                (col[0], col[1], col[2], col[3], col[4], col[5]),
            )
            conn.commit()
    finally:
        # Close the connection even when a row is malformed.
        conn.close()
def __init__(self, context, request=None):
    """Load the docx template and set the attributes used to compile it.

    The template is read from ``self._template_filename``.
    ``use_existing_measures`` comes from the registry record
    ``euphorie.use_existing_measures`` (default ``False``), and
    ``italy_special`` is true when ``self.webhelpers.country`` is ``"it"``.
    """
    self.context, self.request = context, request
    self.template = Document(self._template_filename)

    existing_measures_flag = api.portal.get_registry_record(
        "euphorie.use_existing_measures", default=False
    )
    self.use_existing_measures = existing_measures_flag

    self.tool_type = get_tool_type(self.webhelpers._survey)
    self.tti = getUtility(IToolTypesInfo)

    country = self.webhelpers.country
    self.italy_special = country == "it"
def word_file_data(fileLocation):
    """Return the non-empty first-column cell texts of the first table.

    The resulting list is also printed.
    """
    first_table = Document(fileLocation).tables[0]
    first_column = (table_row.cells[0].text for table_row in first_table.rows)
    fileContentList = [value for value in first_column if value != '']
    print(fileContentList)
    return fileContentList
def parse_doc():
    """Handle an uploaded .docx file and extract contact details and skills.

    On a POST request the file under form key ``img`` is read. For a .docx
    upload, the paragraph text is joined and passed to the ``ex`` helpers to
    extract name, mobile number, mail and skills, which are stored in the
    session. If any of them is missing, a second pass runs on the output of
    ``ex.txt_extraction`` and prints what it finds.

    Returns:
        str: ``"Skills"`` after a .docx upload was processed, otherwise
        ``"File Not Uploaded"``.
    """
    if request.method == 'POST':
        filename = request.files['img']
        txt = list()
        logging.getLogger().setLevel(logging.INFO)
        logging.info('File Name {}'.format(filename))

        # Use the text after the LAST dot as the extension. The original
        # ``split('.')[1]`` returned the wrong piece for names such as
        # "cv.final.docx" and raised IndexError for names without a dot.
        uploaded_name = filename.filename or ''
        if '.' in uploaded_name:
            fileextens = uploaded_name.rsplit('.', 1)[1].lower()
        else:
            fileextens = ''
        print(fileextens)

        if fileextens == 'docx':
            logging.info('File name ends with docx')
            doc = Document(filename)
            for para in doc.paragraphs:
                txt.append(para.text)
            full_t = ' '.join(txt)

            name_e = ex.name_extraction(full_t)
            session['name_session'] = name_e
            logging.info('Extracted Name {}'.format(name_e))

            mob = ex.extract_mob_number(full_t)
            session['mob_session'] = mob
            logging.info('Extracted Mobile {}'.format(mob))

            mail = ex.extract_mail(full_t)
            session['mail_session'] = mail
            logging.info('Extracted mail {}'.format(mail))
            print(session.get('mail_session'))

            skills = ex.extract_skills(doc)
            logging.info('Extracted Skills {}'.format(skills))
            session['skills_session'] = skills

            # Second pass for whatever the first pass did not find.
            # NOTE(review): these fallback values are only printed, not
            # stored in the session - confirm whether that is intended.
            if not mail or not name_e or not mob or not skills:
                txt_doc = ex.txt_extraction(filename)
                if not mob:
                    mob_doc = ex.extract_mob_number(txt_doc)
                    print(mob_doc)
                if not mail:
                    mail_doc = ex.extract_mail(txt_doc)
                    print(mail_doc)
                if not name_e:
                    name_doc = ex.name_extraction(txt_doc)
                    print(name_doc)
                if not skills:
                    url = ex.extract_linkedinurl(txt_doc)
                    skills = linkedin.skills_linkdn(url)
                    print(skills)
                    print("skills")
            # NOTE(review): the nesting of this return is not visible in the
            # flattened source; placed at the end of the .docx branch.
            return "Skills"
    return "File Not Uploaded"
def parse_all_docxs_table():
    """Export every table of every basket document to its own .xlsx file.

    For each file in ``./src/data/basket_elective_docxs/`` and each table:
    row 0 supplies the output file name (first cell, with spaces, ``/`` and
    ``:`` replaced by ``_``), row 1 supplies the column headers, and later
    rows are data. The columns ``Course code``, ``Course name`` and
    ``L-T-P-C`` are written to ``./src/tmp/baskets/<name>.xlsx``, with a space
    inserted after the first two characters of the course code.

    Raises:
        KeyError: if a data row lacks one of the three expected headers.
    """
    path = './src/data/basket_elective_docxs/'
    for filename in os.listdir(path):
        document = Document(path + filename)  # read from docx file
        for table in document.tables:
            data = []
            keys = None
            fname = ""
            for i, row in enumerate(table.rows):
                text = (cell.text for cell in row.cells)
                if i == 0:
                    # Title row: derive a file-system-safe output name.
                    fname = tuple(text)[0] + '.xlsx'
                    for unsafe in (" ", "/", ":"):
                        fname = fname.replace(unsafe, "_")
                    fname = './src/tmp/baskets/' + fname
                elif i == 1:
                    keys = tuple(text)
                else:
                    data.append(dict(zip(keys, text)))

            # The original also wrote an xlwt workbook to ``fname`` here, and
            # in a branch guarded by ``str(...) == False`` that could never
            # run. Both are dropped: the file saved below overwrites that
            # output, so the final result is the same.
            wb = openpyxl.Workbook()
            sheet = wb.active
            sheet.cell(row=1, column=1).value = 'Course code'
            sheet.cell(row=1, column=2).value = 'Course name'
            sheet.cell(row=1, column=3).value = 'L-T-P-C'
            for row_number, entry in enumerate(data, start=2):
                code = entry['Course code']
                sheet.cell(row=row_number, column=1).value = code[:2] + " " + code[2:]
                sheet.cell(row=row_number, column=2).value = entry['Course name']
                sheet.cell(row=row_number, column=3).value = entry['L-T-P-C']
            wb.save(fname)
def fasterSolution(folderAddress, pic, progress):
    """Fill a 1000x4 table with the same picture in every cell and save it.

    The picture path is built from ``folderAddress`` and ``pic`` plus a
    ``.png`` suffix; the output goes to the ``singleFolder`` sub-folder as
    ``<pic>_1.docx``. ``progress`` is set to the elapsed time since the
    module-level ``start_time`` before building, after each inserted picture,
    and after saving.
    """

    def _refresh_progress():
        # Show elapsed seconds and let the widget redraw.
        progress['value'] = time.time() - start_time
        progress.update()

    _refresh_progress()

    document = Document()
    COLUMNS = 4
    ROWS = 1000
    table = document.add_table(rows=ROWS, cols=COLUMNS)
    # The cell sequence is read once and sliced per row below.
    all_cells = table._cells

    for row_index in range(ROWS):
        first = row_index * COLUMNS
        for current_cell in all_cells[first:first + COLUMNS]:
            picture_run = current_cell.paragraphs[0].add_run()
            picture_run.add_picture(
                "\\".join([folderAddress, pic + ".png"]),
                width=350000 * 0.71,
                height=350000 * 0.49,
            )
            # NOTE(review): per-picture refresh assumed; the flattened source
            # does not show whether this sat in the inner or outer loop.
            _refresh_progress()

    document.save("\\".join([folderAddress, "singleFolder", pic + "_" + str(1) + ".docx"]))
    _refresh_progress()
def it_adds_to_doc_without_header(self):
    """``add_header()`` on the first section adds one header reference."""
    document = Document(dir_pkg_path)
    first_section = document.sections[0]
    header = first_section.add_header()

    # Exactly one w:headerReference must now be present.
    header_refs = first_section.findall(qn('w:headerReference'))
    assert len(header_refs) == 1

    # The new header starts empty and accepts a paragraph.
    assert header
    assert len(header.paragraphs) == 0
    header.add_paragraph('foobar')
    assert len(header.paragraphs) == 1
def parse_summary(assignment, file):
    """Build a summary of the form "<course title> <assignment>".

    The course title is the first match, at the start of a paragraph, of
    three capital letters, a space and four digits; it is empty when no
    paragraph matches. Newlines in ``assignment`` become ``', '``.

    Returns:
        str: the summary, or ``'N/A'`` when ``assignment`` is falsy.
    """
    document = Document(file)
    pattern = re.compile("[A-Z][A-Z][A-Z] [0-9][0-9][0-9][0-9]")

    # Lazily scan paragraphs and stop at the first one that matches.
    candidates = (pattern.match(paragraph.text) for paragraph in document.paragraphs)
    first_hit = next((hit for hit in candidates if hit), None)
    course_title = first_hit.group() if first_hit else ''

    if not assignment:
        return 'N/A'
    return course_title + ' ' + assignment.replace('\n', ', ')
def getTable(filename, n=0):
    """Print table ``n`` of a .docx file as a list of row dictionaries.

    The first row supplies the keys; each later row maps key -> cell text.
    Nothing is returned.
    """
    from docx.api import Document

    selected_table = Document(filename).tables[n]

    rows = iter(selected_table.rows)
    header_row = next(rows, None)
    if header_row is None:
        data = []
    else:
        keys = tuple(cell.text for cell in header_row.cells)
        data = [dict(zip(keys, (cell.text for cell in row.cells))) for row in rows]
    print(data)
def read_table(docv):
    """Return the rows of the document's second table as dictionaries.

    Every row, including the first, is mapped onto the fixed keys
    ``number``, ``title``, ``comments`` and ``no``.
    """
    second_table = Document(docv).tables[1]
    keys = ("number", "title", "comments", "no")
    return [
        dict(zip(keys, (cell.text for cell in table_row.cells)))
        for table_row in second_table.rows
    ]
def it_should_raise_if_not_a_Word_file(self, Package_, package_, docx_):
    """``Document._open`` raises ValueError for a non-Word content type."""
    main_document = package_.main_document
    main_document.content_type = 'foobar'

    with pytest.raises(ValueError):
        Document._open(docx_)
def it_opens_default_template_if_no_file_provided(self, Package_, default_docx_):
    """``Document._open(None)`` opens the default template package."""
    no_file = None
    Document._open(no_file)
    Package_.open.assert_called_once_with(default_docx_)
def it_can_open_a_docx_file(self, open_fixture):
    """``Document._open`` opens the package and returns its part and package."""
    docx_, Package_, package_, document_part_ = open_fixture

    document_part, package = Document._open(docx_)

    Package_.open.assert_called_once_with(docx_)
    # Compare against the fixture's part. The original compared
    # ``document_part`` with itself, which can never fail.
    assert document_part is document_part_
    assert package is package_