def test_extract_with_span(self):
    """Check plain-text extraction of a document that uses styled spans.

    Loads ``examples/simplestyles.odt`` from the directory of this test
    module and asserts that ``teletype.extractText`` applied to the
    document body returns the expected concatenated text.
    """
    poem_odt = os.path.join(
        os.path.dirname(__file__), u"examples", u"simplestyles.odt")
    d = load(poem_odt)
    self.assertEqual(
        u'Plain textBoldItalicBold italicUnderlineUnderline italicUnderline bold italicKm2 - superscriptH2O - subscript',
        teletype.extractText(d.body))
def test_extract_with_span(self):
    """Check plain-text extraction of a document that uses styled spans.

    Loads ``examples/simplestyles.odt`` from the directory of this test
    module and asserts that ``teletype.extractText`` applied to the
    document's ``text`` element returns the expected concatenated text.
    """
    poem_odt = os.path.join(
        os.path.dirname(__file__), "examples", "simplestyles.odt")
    d = load(poem_odt)
    self.assertEqual(
        u'Plain textBoldItalicBold italicUnderlineUnderline italicUnderline bold italicKm2 - superscriptH2O - subscript',
        teletype.extractText(d.text))
def _get_conclusion_from_report(report):
    """Collect the conclusion lines of an ODF report.

    Loads the document at ``report.path`` and gathers every paragraph whose
    text, after stripping surrounding whitespace, starts with 'Заключение:'.
    The marker is removed from the text and the remainder is stripped.

    Returns:
        The collected conclusions joined with '; ' (an empty string when
        there are none), or 'Не найдено' when loading raises
        ``FileNotFoundError``.
    """
    try:
        doc = load(report.path)
    except FileNotFoundError:
        return 'Не найдено'

    conclusion = []
    for paragraph in doc.getElementsByType(text.P):
        # Extract once per paragraph; the value is needed both for the
        # prefix check and for the stored result.
        content = teletype.extractText(paragraph)
        if content.strip().startswith('Заключение:'):
            conclusion.append(content.replace('Заключение:', '').strip())
    return '; '.join(conclusion)
def convert_text_only(self):
    """Convert the text of the ODT document and save it as a new file.

    Every grandchild node of ``self.doc.body`` is reduced to plain text,
    passed through ``self.converter`` and added as its own paragraph to a
    new ``OpenDocumentText``. The result is saved to the path obtained
    from ``self.p_odt.with_suffix(...)``, using ``self.extension`` when it
    is set and '.all.odt' otherwise.

    :return: None
    """
    body = self.doc.body
    new_doc = OpenDocumentText()
    for section in body.childNodes:
        for node in section.childNodes:
            converted = self.converter(teletype.extractText(node))
            para = text.P()
            teletype.addTextToElement(para, converted)
            new_doc.text.addElement(para)

    # Replace the font in the styles.
    if self.style_font:
        self.set_font_for_all_styles()

    suffix = self.extension if self.extension else '.all.odt'
    target = self.p_odt.with_suffix(suffix)
    new_doc.save(target.as_posix())
def load_file(self, file):
    """Load an ODF document and keep the text of its longer paragraphs.

    The text of every paragraph is extracted; only texts longer than 20
    characters are stored in ``self.allparas``. ``self.is_valid`` is set
    to True afterwards.
    """
    document = load(file)
    extracted = [
        teletype.extractText(para)
        for para in document.getElementsByType(text.P)
    ]
    self.allparas = [content for content in extracted if len(content) > 20]
    self.is_valid = True
def getODTContent(filename):
    """Return the text of all paragraphs of an ODF document.

    Args:
        filename: Document passed to ``load``.

    Returns:
        The paragraph texts joined with single spaces.
    """
    textdoc = load(filename)
    # Iterate the paragraphs directly; no local named ``list`` that would
    # shadow the builtin.
    return ' '.join(
        teletype.extractText(para)
        for para in textdoc.getElementsByType(text.P))
def cargar_desde_carpeta(dataset):
    """Load the text of the stored document belonging to each dataset item.

    For every item the id is obtained with ``extraer_id``. The text is read
    from ``doc/myfile_<id>.odt``; if that fails, ``doc/myfile_<id>.pdf`` is
    parsed with tika instead. The 1-based position of each item is printed
    as progress output.

    Returns:
        A list with one entry per item: the document text,
        "Archivo no encontrado" when neither file could be read, or 'NADA'
        when ``extraer_id`` returned a falsy id.
    """
    txt_declaracion = []
    for n, item in enumerate(dataset, start=1):
        print(n)
        id_file = extraer_id(item)
        if not id_file:
            txt_declaracion.append('NADA')
            continue
        try:
            textdoc = load('doc/myfile_' + id_file + '.odt')
            texto = ' '.join(
                teletype.extractText(para)
                for para in textdoc.getElementsByType(text.P))
        # Deliberate best-effort fallback, but narrowed from a bare except so
        # that KeyboardInterrupt and SystemExit still propagate.
        except Exception:
            try:
                from tika import parser
                file_data = parser.from_file('doc/myfile_' + id_file + '.pdf')
                texto = file_data['content']
            except Exception:
                texto = "Archivo no encontrado"
        txt_declaracion.append(texto)
    return txt_declaracion
def __replace_text_in_odf_element(a_file, a_element_foo, a_replace_map: list):
    """Replace marker strings in the text of every element of one type.

    Args:
        a_file: ODF document that is searched and modified in place.
        a_element_foo: Element type passed to ``getElementsByType``.
        a_replace_map: Sequence of pairs; every occurrence of ``mark[0]`` in
            an element's text is replaced by ``mark[1]``.
    """
    # Maps every original element to the paragraph that will replace it.
    replace_map = {}
    for element in a_file.getElementsByType(a_element_foo):
        text = teletype.extractText(element)
        for mark in a_replace_map:
            text = text.replace(mark[0], mark[1])
        new_odf_element = odf_text.P()
        new_odf_element.setAttribute("stylename", element.getAttribute("stylename"))
        for space_elements in element.getElementsByType(odf_text.S):
            # Without this, all whitespace characters at the start of lines would be removed
            spaces = space_elements.getAttribute('c')
            if spaces is not None:
                new_space_element = odf_text.S()
                new_space_element.setAttribute('c', spaces)
                new_odf_element.appendChild(new_space_element)
        new_odf_element.addText(text)
        replace_map[element] = new_odf_element
    # Swap the elements only after the search above has finished.
    for old, new in replace_map.items():
        old.parentNode.insertBefore(new, old)
        old.parentNode.removeChild(old)
        # Without this the node tree would break
        a_file.rebuild_caches(new.parentNode)
def openFile():
    """Ask the user for a file and show its content in a new text widget.

    '.txt' files are read as text, '.odt' files have the text of every
    paragraph inserted one after another. For any other type an info box
    is shown. Nothing happens besides a console message when the dialog is
    cancelled.
    """
    filename = tk.filedialog.askopenfilename(initialdir ='C:\\')
    if len(filename) == 0:
        print("open file for reading is cancelled.")
        return
    # Everything after the last dot is treated as the file type.
    filetype = filename.split(".")[-1]
    if filetype == "txt":
        try:
            # The context manager closes the file even when reading or
            # inserting raises.
            with open(filename, 'r') as file:
                t = create_newtext()
                t.insert('end', file.read())
        except IOError as e:
            print(e, filename)
    elif filetype == "odt":
        textdoc = load(filename)
        allparas = textdoc.getElementsByType(text.P)
        t = create_newtext()
        for para in allparas:
            t.insert('end', teletype.extractText(para))
    else:
        tk.messagebox.showinfo(
            title='Unsupported file type',
            message='This is unsupported file type, now the supported file type are .txt and .odt')
def getODTText(self, file):
    """Read an .odt file and store its text in ``self.FinalOutputText``.

    The texts of all paragraphs are concatenated without a separator.
    """
    document = load(file)
    self.FinalOutputText = "".join(
        teletype.extractText(para)
        for para in document.getElementsByType(text.P))
def odt_analysis(file, pattern, path):
    """Count occurrences of ``pattern`` in the paragraphs of an .odt file.

    Each paragraph text is lower-cased before counting. The total is handed
    to ``show`` together with ``path`` and ``file``.
    """
    textdoc = load(file)
    counter = sum(
        teletype.extractText(paragraph).lower().count(pattern)
        for paragraph in textdoc.getElementsByType(otext.P))
    show(path, file, counter)
def __read_text_cell(self, cell):
    """Return the text of a cell with one line per paragraph."""
    return '\n'.join(
        extractText(paragraph) for paragraph in cell.getElementsByType(P))
def parse_odt(response):
    """Extract the paragraph text of an ODT document held in a response.

    ``response.content`` is written to a temporary file which is then
    loaded by name with ``opendocument.load``.

    Returns:
        The texts of all paragraphs joined with single spaces.
    """
    # The context manager closes (and thereby removes) the temporary file
    # deterministically, also when loading fails.
    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(response.content)
        # Flush so the loader, which opens the file by name, sees all bytes.
        tmp.flush()
        odtfile = opendocument.load(tmp.name)
    texts = odtfile.getElementsByType(text.P)
    return " ".join(teletype.extractText(t) for t in texts)
def get_text(self):
    """Return the document text, one line per grandchild node of the body.

    Every extracted text is terminated with a newline.
    """
    lines = []
    for section in self.doc.body.childNodes:
        for node in section.childNodes:
            lines.append(f'{teletype.extractText(node)}\n')
    return ''.join(lines)
def 讀odt檔(檔名):
    """Read an .odt file and return the stripped text of each paragraph.

    The interpreter is exited with status 0 when '.odt' does not occur in
    the given file name.
    """
    if '.odt' not in 檔名:
        exit(0)
    document = load(檔名)
    return [
        teletype.extractText(paragraph).strip()
        for paragraph in document.getElementsByType(text.P)
    ]
def __read_text_cell(self, cell):
    """Return the text of a cell, skipping paragraphs inside annotations.

    A paragraph is left out when the tag part of its parent's qname is
    "annotation"; the remaining paragraph texts are joined with newlines.
    """
    lines = []
    for paragraph in cell.getElementsByType(P):
        _, parent_tag = paragraph.parentNode.qname
        if parent_tag == "annotation":
            continue
        lines.append(extractText(paragraph))
    return "\n".join(lines)
def extraer_texto_odt(file_id):
    """Fetch an ODT file with ``download_from_drive`` and return its text.

    The file is stored as ``doc/myfile_<file_id>.odt`` before it is loaded;
    the paragraph texts are joined with single spaces.
    """
    destination = 'doc/myfile_' + file_id + '.odt'
    download_from_drive(file_id, destination)
    paragraphs = load(destination).getElementsByType(text.P)
    return ' '.join(teletype.extractText(para) for para in paragraphs)
def noteline_identification(self, ligne):
    """Tell whether a line looks like a line of notes.

    The line counts as a note line as soon as a character accepted by
    ``self.check_char_note`` is followed by a space, by "#", or by a "b"
    that is itself followed by a space, "m"/"M" or "/".

    Returns:
        True when such a character sequence is found, False otherwise.
    """
    text = teletype.extractText(ligne)
    for i, char in enumerate(text):
        if not self.check_char_note(char):
            continue
        # Slices yield "" past the end of the text instead of raising
        # IndexError, so a note character at the very end just does not match.
        following = text[i + 1:i + 2]
        after = text[i + 2:i + 3]
        if following in (" ", "#"):
            return True
        # ``lower`` must be called; comparing the bound method itself to "m"
        # is always False.
        if following == "b" and (
                after == " " or after.lower() == "m" or after == "/"):
            return True
    return False
def odf_to_text(document_path, event_handler):
    """Return the text of an ODF document with one line per paragraph.

    Leading and trailing whitespace of the joined result is stripped.
    ``event_handler`` is accepted but not used here.
    """
    from odf import teletype as odf_teletype
    from odf import text as odf_text
    from odf.opendocument import load as odf_load

    document = odf_load(document_path)
    lines = [
        odf_teletype.extractText(paragraph)
        for paragraph in document.getElementsByType(odf_text.P)
    ]
    return '\n'.join(lines).strip()
def test_extract(self):
    """Convert a paragraph to plain text.

    Loads ``examples/serious_poem.odt`` from the directory of this test
    module and compares the extracted text of its first paragraph with the
    expected string.
    """
    poem_odt = os.path.join(os.path.dirname(__file__), u"examples",
                            u"serious_poem.odt")
    d = load(poem_odt)
    allparas = d.getElementsByType(P)
    # Markup of the paragraph under test, for reference only:
    #   <text:p text:style-name="Standard">The boy stood <text:s text:c="3"/>on
    #   the burning deck,<text:line-break/><text:tab/>Whence
    #   all<text:tab/>but<text:tab/><text:tab/>him had fled.<text:line-break/>The
    #   flames <text:s text:c="2"/>that lit<text:tab/>the
    #   battle's<text:tab/>wreck,<text:line-break/> <text:s text:c="11"/>Shone
    #   o'er him, round the dead. <text:s text:c="2"/></text:p>
    self.assertEqual(
        u"The boy stood on the burning deck,\n\tWhence all\tbut\t\thim had fled.\nThe flames that lit\tthe battle's\twreck,\n Shone o'er him, round the dead. ",
        teletype.extractText(allparas[0]))
def odftext(self):
    """Extract texts from .odt/.ods/.odp files.

    Every paragraph text is followed by a single space; apostrophes (')
    in the result are replaced with ‘.
    """
    document = odfload(self.path)
    pieces = [
        teletype.extractText(para) + ' '
        for para in document.getElementsByType(odftext.P)
    ]
    return ''.join(pieces).replace("'", "‘")
def test_extract(self):
    """Convert a paragraph to plain text.

    Loads ``examples/serious_poem.odt`` from the directory of this test
    module and compares the extracted text of its first paragraph with the
    expected string.
    """
    poem_odt = os.path.join(os.path.dirname(__file__), "examples",
                            "serious_poem.odt")
    d = load(poem_odt)
    allparas = d.getElementsByType(P)
    # Markup of the paragraph under test, for reference only:
    #   <text:p text:style-name="Standard">The boy stood <text:s text:c="3"/>on
    #   the burning deck,<text:line-break/><text:tab/>Whence
    #   all<text:tab/>but<text:tab/><text:tab/>him had fled.<text:line-break/>The
    #   flames <text:s text:c="2"/>that lit<text:tab/>the
    #   battle's<text:tab/>wreck,<text:line-break/> <text:s text:c="11"/>Shone
    #   o'er him, round the dead. <text:s text:c="2"/></text:p>
    self.assertEqual(
        u"The boy stood on the burning deck,\n\tWhence all\tbut\t\thim had fled.\nThe flames that lit\tthe battle's\twreck,\n Shone o'er him, round the dead. ",
        teletype.extractText(allparas[0]),
    )
def switch_notes(self, qte):
    """Rewrite every note line with the notes from ``self.gamme.switch_note``.

    For each index in ``self.lignes`` the paragraph text is scanned; where it
    matches the next expected note, the corresponding new note is written
    instead. Other characters are copied, except "b" and "#", which are
    dropped. A new paragraph with the old style name then replaces the
    entry in ``self.allText``. ``self.reset_notes()`` is called at the end.

    NOTE(review): the scan runs over ``range(len(old_text) - 1)``, so the
    last character of the text is never copied - confirm this is intended.
    """
    for i in range(len(self.lignes)):
        #print(self.allText[self.lignes[i]])
        # note line
        old_text = teletype.extractText(self.allText[self.lignes[i]])
        newer_text = ""
        # Replacement notes, in the same order as self.notes[i].notes.
        new_notes = []
        for j in range(len(self.notes[i].notes)):
            new_notes.append(
                self.gamme.switch_note(self.notes[i].notes[j], self.bemol, qte).note)
        print("NOTESET")
        self.notes[i].show_noteset()
        print("NEW NOTESET : " + str(new_notes))
        # Index of the next note expected in the text.
        cpt = 0
        for j in range(len(old_text) - 1):
            # A note matches either as one character or together with the
            # character that follows it.
            if old_text[j] != " " and old_text[j] != "/" and (
                    old_text[j] == self.notes[i].notes[cpt].note or
                (old_text[j] + old_text[j + 1]) == self.notes[i].notes[cpt].note):
                print("NEW NOTE")
                newer_text += new_notes[cpt]
                cpt += 1
                # Stop once every note has been written.
                if (cpt == len(new_notes)):
                    newer_text += " "
                    break
            else:
                if (old_text[j] != "b" and old_text[j] != "#"):
                    newer_text += old_text[j]
        print("OLD TEXT : " + old_text)
        print("NEW TEXT : " + newer_text)
        new_S = text.P()
        new_S.setAttribute(
            "stylename", self.allText[self.lignes[i]].getAttribute("stylename"))
        new_S.addText(newer_text)
        self.allText[self.lignes[i]] = new_S
    # print("\n\n\n")
    # for i in range(len(self.allText)):
    # print(self.allText[i])
    self.reset_notes()
def __init__(self, ligne):
    """Collect the notes found in a line.

    Every character of the line's text that is contained in
    ``self.gamme.notes`` becomes a ``Note``; a directly following "b" or
    "#" is appended to the note name.
    """
    self.notes = []
    self.gamme = Gamme()
    text = teletype.extractText(ligne)
    for i, char in enumerate(text):
        if char not in self.gamme.notes:
            continue
        # Slicing yields "" after the last character, so a note at the very
        # end of the text does not raise IndexError.
        accidental = text[i + 1:i + 2]
        if accidental in ("b", "#"):
            self.notes.append(Note(char + accidental))
        else:
            self.notes.append(Note(char))
def get_text_odt(file_path):
    """Extract the text from an ODT file.

    Args:
        file_path (str): local path to file

    Returns:
        String with the text of all paragraphs, joined with single spaces.
    """
    document = load(file_path)
    paragraphs = document.getElementsByType(odf_text.P)
    return ' '.join(teletype.extractText(para) for para in paragraphs)
def read_files(p):
    """Return the text content of a file, selected by its extension.

    Supported extensions: .txt/.py/.cpp/.c (read as text), .pdf (text of
    each page via PyPDF2), .docx (paragraph texts) and .odt (paragraph
    texts). Pages and paragraphs are joined with newlines.

    Args:
        p: Path of the file to read.

    Returns:
        The extracted text, or None for any other extension.
    """
    _, file_extension = os.path.splitext(p)

    if file_extension in (".txt", ".py", ".cpp", ".c"):
        with open(p, 'r') as file:
            return file.read()

    if file_extension == ".pdf":
        # TODO: needs tuning, sometimes does not work.
        # TODO: the textract (tesseract) and slate3k fallbacks are still
        # not fixed.
        with open(p, 'rb') as file:
            pdfReader = PyPDF2.PdfFileReader(file)
            pages = [
                pdfReader.getPage(i).extractText()
                for i in range(pdfReader.numPages)
            ]
        return '\n'.join(pages)

    if file_extension == ".docx":
        doc = docx.Document(p)
        return '\n'.join(para.text for para in doc.paragraphs)

    if file_extension == ".odt":
        textdoc = load(p)
        return '\n'.join(
            teletype.extractText(para)
            for para in textdoc.getElementsByType(text.P))

    return None
def __fill_odf_table(a_file, a_tables_to_draw: List[TableToDraw]):
    """Fill tables of an ODF document at their placeholder row.

    In every table the first row whose text equals "%insert_table__" is
    removed, and rows built from ``a_tables_to_draw`` are added to that
    table, using the cell and paragraph style names of the removed row.

    Args:
        a_file: ODF document that is modified in place.
        a_tables_to_draw: Tables whose ``signal_type``, ``limit``,
            ``error_limit`` and ``points`` are written.
    """
    for table in a_file.getElementsByType(odf_table.Table):
        for table_row in table.getElementsByType(odf_table.TableRow):
            if teletype.extractText(table_row) == "%insert_table__":
                # Remember the styles of the flag row before it is removed.
                cell_style = table_row.getElementsByType(
                    odf_table.TableCell)[0].getAttribute("stylename")
                text_style = table_row.getElementsByType(
                    odf_text.P)[0].getAttribute("stylename")
                row_length_in_cells = __get_table_columns_count(table)
                # Remove the flag row
                table_row.parentNode.removeChild(table_row)
                for table_to_draw in a_tables_to_draw:
                    table_header = [
                        "Тип сигнала: " + table_to_draw.signal_type,
                        "Предел измерения: " + table_to_draw.limit,
                        "Допустимая погрешность: " + table_to_draw.error_limit
                    ]
                    __add_row_with_texts_to_table(table, None, cell_style, table_header, row_length_in_cells)
                    for frequency in table_to_draw.points.keys():
                        # A frequency row is only written for non-zero frequencies.
                        if int(frequency) != 0:
                            __add_row_with_text_to_table(
                                table, None, cell_style,
                                ' '.join(["Частота:", str(frequency), "Гц"]),
                                row_length_in_cells)
                        for points in table_to_draw.points[frequency]:
                            points_row = __add_row_to_table(table)
                            for point in points:
                                __add_cell_to_row(points_row, cell_style, text_style, str(point))
                            for empty_cell in range(row_length_in_cells - len(points)):
                                # So that empty cells are not merged into one
                                __add_cell_to_row(points_row, cell_style, text_style, "")
                # Without this the node tree would break
                a_file.rebuild_caches(table_row.parentNode)
                break
def switch_type(self):
    """Rewrite every note line using the other accidental spelling.

    For each note the replacement is taken from ``self.gamme.notes_diese``
    when ``self.bemol`` is true and from ``self.gamme.notes_bemol``
    otherwise, at the index returned by ``self.get_char_index``. The
    paragraph text is then rebuilt with the replacements, ``self.bemol`` is
    toggled and ``self.reset_notes()`` is called.

    NOTE(review): the scan runs over ``range(len(old_text) - 1)``, so the
    last character of the text is never copied - confirm this is intended.
    """
    for i in range(len(self.lignes)):
        old_text = teletype.extractText(self.allText[self.lignes[i]])
        new_text = ""
        # Replacement notes, in the same order as self.notes[i].notes.
        new_notes = []
        # Index of the next note expected in the text.
        cpt = 0
        for j in range(len(self.notes[i].notes)):
            ind = self.get_char_index(self.notes[i].notes[j].note)
            if self.bemol:
                new_notes.append(self.gamme.notes_diese[ind])
            else:
                new_notes.append(self.gamme.notes_bemol[ind])
        print("NOTES : " + str(new_notes))
        for j in range(len(old_text) - 1):
            # A note matches either as one character or together with the
            # character that follows it.
            if old_text[j] != " " and old_text[j] != "/" and (
                    old_text[j] == self.notes[i].notes[cpt].note or
                (old_text[j] + old_text[j + 1]) == self.notes[i].notes[cpt].note):
                print("NEW NOTE")
                new_text += new_notes[cpt]
                cpt += 1
                # Stop once every note has been written.
                if (cpt == len(new_notes)):
                    new_text += " "
                    break
            else:
                if (old_text[j] != "b" and old_text[j] != "#"):
                    new_text += old_text[j]
        print("OLD TEXT : " + old_text)
        print("NEW TEXT : " + new_text)
        new_S = text.P()
        new_S.setAttribute(
            "stylename", self.allText[self.lignes[i]].getAttribute("stylename"))
        new_S.addText(new_text)
        self.allText[self.lignes[i]] = new_S
    self.bemol = not self.bemol
    self.reset_notes()
def get_text_from_odt(_odt, save_blank=None) -> str:
    """Return the text of an ODT document.

    The text of every grandchild node of the document body is extracted
    and terminated with a newline.

    :param _odt: document passed to ``load``
    :param save_blank: whether to keep the gaps between paragraphs as
        newlines; currently not used
    :return: text
    """
    # TODO: save_blank - as in pandoc.
    document = load(_odt)
    lines = []
    for section in document.body.childNodes:
        for node in section.childNodes:
            lines.append(f'{teletype.extractText(node)}\n')
    return ''.join(lines)
def con(filenames, code, counter, subdir):
    """Convert ODF documents to numbered text files under ``input/<subdir>``.

    The directory is created when missing. For every file the counter is
    incremented and the paragraph texts, concatenated without a separator,
    are written as UTF-8 to ``input/<subdir>/<code><counter>.txt`` with the
    counter zero-padded to four digits.

    Args:
        filenames: Documents passed to ``load`` one by one.
        code: Prefix of the output file names.
        counter: Number of the file written before the first one here.
        subdir: Sub-directory of ``input/`` that receives the files.

    Returns:
        The counter value after the last file.
    """
    # exist_ok avoids the race between checking for and creating the folder.
    os.makedirs('input/' + subdir, exist_ok=True)
    for file in filenames:
        counter = counter + 1
        textfile = load(file)
        content = ''.join(
            teletype.extractText(para)
            for para in textfile.getElementsByType(text.P))
        out_path = ("input/" + subdir + '/' + str(code)
                    + str(counter).zfill(4) + ".txt")
        # The context manager closes the output even when writing fails.
        with open(out_path, 'w', encoding="utf-8") as saveFile:
            saveFile.write(content)
    return counter
def clean_file(file_path):
    """Return the non-empty lines from 'DISC 1' up to 'END OF MINIDISCS'.

    Collection starts with the first paragraph containing 'DISC 1' and ends
    before the first paragraph containing 'END OF MINIDISCS'. Lines that
    contain any entry of ``skip_words`` are left out.
    """
    document = load(file_path)
    started = False
    line_items = []
    for paragraph in document.getElementsByType(text.P):
        line = teletype.extractText(paragraph)
        if 'END OF MINIDISCS' in line:
            break
        if not started and 'DISC 1' in line:
            started = True
        if not started or not line:
            continue
        if any(a_word in line for a_word in skip_words):
            continue
        line_items.append(line)
    return line_items
def __init__(self, ligne):
    """Collect the notes found in a line.

    Every character of the line's text that is contained in
    ``self.gamme.notes`` becomes a ``Note``; a directly following "b" or
    "#" is appended to the note name.
    """
    self.notes = []
    self.gamme = Gamme()
    text = teletype.extractText(ligne)
    for i, txt in enumerate(text):
        if txt not in self.gamme.notes:
            continue
        # Slicing yields "" after the last character, so a note at the very
        # end of the text does not raise IndexError.
        accidental = text[i + 1:i + 2]
        new_note = txt + accidental if accidental in ("b", "#") else txt
        self.notes.append(Note(new_note))
def main():
    """Fill the calendar template with the output of ``ncal`` and save it.

    Runs ``ncal -h`` (with ``-m <argv[1]>`` when an argument is given),
    regroups the output into rows that are appended to the global ``grid``,
    replaces the text of template paragraphs via ``replace_tmpl`` and saves
    the document under a name containing ``month_year``.

    NOTE(review): ``xrange`` and the bytes handling look like Python 2 -
    confirm the target interpreter.
    """
    global grid
    global month_year
    if len(sys.argv)==1:
        cmd_line="ncal -h"
    else:
        cmd_line="ncal -h -m %s" %(sys.argv[1])
    cal_prc=subprocess.Popen(cmd_line.split(' '),stdout=subprocess.PIPE)
    cal_str_lst_tr=cal_prc.communicate()[0].split('\n')
    # The first output line is kept as is; month_year is taken from it below.
    cal_str_lst=[cal_str_lst_tr[0]]
    # Build one line per 3-character column of the seven lines that follow.
    for c in xrange(7):
        line=u""
        for l in xrange(7):
            st=(cal_str_lst_tr[l+1].decode("utf-8")+u" ")[c*3:(c*3+3)]
            line=line+st
        cal_str_lst.append(line)
    month_year=cal_str_lst[0].strip()
    header=cal_str_lst[1].split(' ')
    header=[i for i in header if len(i)>0]
    for n in cal_str_lst:
        ned=[]
        # Only lines longer than 20 characters are split into 7 two-character cells.
        if (len(n)>20):
            for d in xrange(7):
                day=''+n[3*d]+n[(3*d)+1]
                ned.append(day)
            grid.append(ned)
    grid=grid[2:]
    cal=load("template_ru.ods")
    texts = cal.getElementsByType(text.P)
    s=len(texts)
    for i in range(s):
        old_text = teletype.extractText(texts[i]).encode("utf-8")
        new_text =replace_tmpl(old_text)
        # Paragraphs for which replace_tmpl returns None are left untouched.
        if new_text!=None:
            new_S = text.P()
            new_S.setAttribute("stylename",texts[i].getAttribute("stylename"))
            new_S.addText(new_text)
            texts[i].parentNode.insertBefore(new_S,texts[i])
            texts[i].parentNode.removeChild(texts[i])
    cal.save("календарь на %s.ods" %(month_year))
def read_file(self):
    """Load the ODF file at ``self.path`` into ``self.content``.

    The texts of all paragraphs are joined with single spaces.
    """
    document = load(self.path)
    pieces = [
        teletype.extractText(paragraph)
        for paragraph in document.getElementsByType(text.P)
    ]
    self.content = " ".join(pieces)